From 39a8a2645470e22073d1da41fb1ee9dbc35364e6 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 3 Aug 2025 01:32:44 +0530 Subject: [PATCH 01/73] feat: added the skeleton structure of the x86 module --- crates/intrinsic-test/Cargo.toml | 2 + crates/intrinsic-test/src/main.rs | 2 + crates/intrinsic-test/src/x86/intrinsic.rs | 43 ++++++++++++++++++++ crates/intrinsic-test/src/x86/mod.rs | 31 ++++++++++++++ crates/intrinsic-test/src/x86/types.rs | 37 +++++++++++++++++ crates/intrinsic-test/src/x86/xml_parser.rs | 45 +++++++++++++++++++++ 6 files changed, 160 insertions(+) create mode 100644 crates/intrinsic-test/src/x86/intrinsic.rs create mode 100644 crates/intrinsic-test/src/x86/mod.rs create mode 100644 crates/intrinsic-test/src/x86/types.rs create mode 100644 crates/intrinsic-test/src/x86/xml_parser.rs diff --git a/crates/intrinsic-test/Cargo.toml b/crates/intrinsic-test/Cargo.toml index fbbf90e140..df4f7fe706 100644 --- a/crates/intrinsic-test/Cargo.toml +++ b/crates/intrinsic-test/Cargo.toml @@ -19,3 +19,5 @@ pretty_env_logger = "0.5.0" rayon = "1.5.0" diff = "0.1.12" itertools = "0.14.0" +quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] } +serde-xml-rs = "0.8.0" diff --git a/crates/intrinsic-test/src/main.rs b/crates/intrinsic-test/src/main.rs index 44d7aafd82..d780e35160 100644 --- a/crates/intrinsic-test/src/main.rs +++ b/crates/intrinsic-test/src/main.rs @@ -3,10 +3,12 @@ extern crate log; mod arm; mod common; +mod x86; use arm::ArmArchitectureTest; use common::SupportedArchitectureTest; use common::cli::{Cli, ProcessedCli}; +use x86::X86ArchitectureTest; fn main() { pretty_env_logger::init(); diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs new file mode 100644 index 0000000000..27eca89406 --- /dev/null +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -0,0 +1,43 @@ +use crate::common::argument::ArgumentList; +use crate::common::indentation::Indentation; +use 
crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; +use crate::common::intrinsic_helpers::IntrinsicType; +use std::ops::{Deref, DerefMut}; + +#[derive(Debug, Clone, PartialEq)] +pub struct X86IntrinsicType(pub IntrinsicType); + +impl Deref for X86IntrinsicType { + type Target = IntrinsicType; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for X86IntrinsicType { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl IntrinsicDefinition for Intrinsic { + fn arguments(&self) -> ArgumentList { + self.arguments.clone() + } + + fn results(&self) -> X86IntrinsicType { + self.results.clone() + } + + fn name(&self) -> String { + self.name.clone() + } + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + fn print_result_c(&self, _indentation: Indentation, _additional: &str) -> String { + todo!("print_result_c in Intrinsic needs to be implemented!"); + } +} \ No newline at end of file diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs new file mode 100644 index 0000000000..84499b5c4b --- /dev/null +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -0,0 +1,31 @@ +mod intrinsic; +mod types; +mod xml_parser; + +use crate::common::SupportedArchitectureTest; +use crate::common::cli::ProcessedCli; +use crate::common::intrinsic::Intrinsic; +use intrinsic::X86IntrinsicType; + +pub struct X86ArchitectureTest { + intrinsics: Vec>, + cli_options: ProcessedCli, +} + +impl SupportedArchitectureTest for X86ArchitectureTest { + fn create(cli_options: ProcessedCli) -> Box { + todo!("create in X86ArchitectureTest is not implemented") + } + + fn build_c_file(&self) -> bool { + todo!("build_c_file in X86ArchitectureTest is not implemented") + } + + fn build_rust_file(&self) -> bool { + todo!("build_rust_file in X86ArchitectureTest is not 
implemented") + } + + fn compare_outputs(&self) -> bool { + todo!("compare_outputs in X86ArchitectureTest is not implemented") + } +} \ No newline at end of file diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs new file mode 100644 index 0000000000..3dd205c9cb --- /dev/null +++ b/crates/intrinsic-test/src/x86/types.rs @@ -0,0 +1,37 @@ +use super::intrinsic::X86IntrinsicType; +use crate::common::cli::Language; +use crate::common::intrinsic_helpers::IntrinsicTypeDefinition; +use crate::x86::xml_parser::Parameter; + +impl IntrinsicTypeDefinition for X86IntrinsicType { + /// Gets a string containing the type in C format. + /// This function assumes that this value is present in the metadata hashmap. + fn c_type(&self) -> String { + todo!("c_type from IntrinsicTypeDefinition is not defined!") + } + + fn c_single_vector_type(&self) -> String { + // matches __m128, __m256 and similar types + todo!("c_type from IntrinsicTypeDefinition is not defined!") + } + + /// Determines the load function for this type. + fn get_load_function(&self, _language: Language) -> String { + todo!("get_load_function from IntrinsicTypeDefinition is not defined!") + } + + /// Determines the get lane function for this type. 
+ fn get_lane_function(&self) -> String { + todo!("get_lane_function for X86IntrinsicType needs to be implemented!"); + } + + fn from_c(s: &str, target: &str) -> Result { + todo!("from_c from IntrinsicTypeDefinition is not defined!") + } +} + +impl X86IntrinsicType { + pub fn from_param(param: &Parameter) -> Result { + todo!("from_param from X86IntrinsicType is not defined!") + } +} \ No newline at end of file diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs new file mode 100644 index 0000000000..a6b4eb8382 --- /dev/null +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -0,0 +1,45 @@ +use serde::{Deserialize, Deserializer}; + + +// Custom deserializer function to convert strings to u32 +fn string_to_u32<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + return s.as_str().parse::().or(Ok(0u32)); +} + +#[derive(Deserialize)] +struct Data { + #[serde(rename = "intrinsic", default)] + intrinsics: Vec, +} + +#[derive(Deserialize)] +struct XMLIntrinsic { + #[serde(rename = "return")] + return_data: Parameter, + #[serde(rename = "@name")] + name: String, + // #[serde(rename = "@tech")] + // tech: String, + #[serde(rename = "CPUID", default)] + cpuid: Vec, + #[serde(rename = "parameter", default)] + parameters: Vec, +} + +#[derive(Deserialize)] +pub struct Parameter { + #[serde(rename = "@varname")] + pub var_name: String, + #[serde(rename = "@type")] + pub type_data: String, + #[serde(rename = "@etype", default)] + pub etype: String, + #[serde(rename = "@memwidth", default, deserialize_with = "string_to_u32")] + pub memwidth: u32, + #[serde(rename = "@immtype", default)] + pub imm_type: String, +} From acf67c2a2b0b4d6dc559b7c15299e93ce3c6d59e Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 3 Aug 2025 01:35:52 +0530 Subject: [PATCH 02/73] feat: added the XML intrinsic parser for x86 --- crates/intrinsic-test/src/x86/constraint.rs | 20 
++++++ crates/intrinsic-test/src/x86/mod.rs | 1 + crates/intrinsic-test/src/x86/xml_parser.rs | 70 +++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 crates/intrinsic-test/src/x86/constraint.rs diff --git a/crates/intrinsic-test/src/x86/constraint.rs b/crates/intrinsic-test/src/x86/constraint.rs new file mode 100644 index 0000000000..321ee89b6c --- /dev/null +++ b/crates/intrinsic-test/src/x86/constraint.rs @@ -0,0 +1,20 @@ +use crate::common::constraint::Constraint; + +pub fn map_constraints(imm_type: &String) -> Option { + match imm_type.as_str() { + "_MM_FROUND" => Some(Constraint::Range(0..4)), + "_MM_INDEX_SCALE" => Some(Constraint::Set(vec![1, 2, 4, 8])), + "_MM_CMPINT" => Some(Constraint::Range(0..8)), + "_MM_REDUCE" => Some(Constraint::Range(0..8)), + "_MM_FROUND_SAE" => Some(Constraint::Range(0..8)), + "_MM_MANTISSA_NORM" => Some(Constraint::Range(0..4)), + "_MM_MANTISSA_NORM_ENUM" => Some(Constraint::Range(0..4)), + "_MM_MANTISSA_SIGN" => Some(Constraint::Range(0..3)), + "_MM_PERM" => Some(Constraint::Range(0..256)), + "_MM_PERM_ENUM" => Some(Constraint::Range(0..256)), + "_MM_CMPINT_ENUM" => Some(Constraint::Range(0..8)), + "_MM_ROUND_MODE" => Some(Constraint::Set(vec![0, 0x2000, 0x4000, 0x6000])), + "_CMP_" => Some(Constraint::Range(0..32)), + _ => None, + } +} \ No newline at end of file diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 84499b5c4b..ac613643bd 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -1,3 +1,4 @@ +mod constraint; mod intrinsic; mod types; mod xml_parser; diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index a6b4eb8382..55bc33ca7c 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -1,5 +1,12 @@ +use crate::common::argument::{Argument, ArgumentList}; +use crate::common::intrinsic::Intrinsic; +use 
crate::common::intrinsic_helpers::TypeKind; +use crate::x86::constraint::map_constraints; + use serde::{Deserialize, Deserializer}; +use std::path::Path; +use super::intrinsic::X86IntrinsicType; // Custom deserializer function to convert strings to u32 fn string_to_u32<'de, D>(deserializer: D) -> Result @@ -43,3 +50,66 @@ pub struct Parameter { #[serde(rename = "@immtype", default)] pub imm_type: String, } + +pub fn get_xml_intrinsics( + filename: &Path, +) -> Result>, Box> { + let file = std::fs::File::open(filename)?; + let reader = std::io::BufReader::new(file); + let data: Data = + quick_xml::de::from_reader(reader).expect("failed to deserialize the source XML file"); + + let parsed_intrinsics: Vec> = data + .intrinsics + .into_iter() + .filter_map(|intr| { + // Some(xml_to_intrinsic(intr, target).expect("Couldn't parse XML properly!")) + xml_to_intrinsic(intr).ok() + }) + .collect(); + + Ok(parsed_intrinsics) +} + +fn xml_to_intrinsic( + intr: XMLIntrinsic, +) -> Result, Box> { + let name = intr.name; + let result = X86IntrinsicType::from_param(&intr.return_data); + let args_check = intr.parameters.into_iter().enumerate().map(|(i, param)| { + let ty = X86IntrinsicType::from_param(¶m); + if ty.is_err() { + None + } else { + let constraint = map_constraints(¶m.imm_type); + let arg = Argument::::new( + i, + param.var_name.clone(), + ty.unwrap(), + constraint, + ); + Some(arg) + } + }); + + let args = args_check.collect::>(); + if args.iter().any(|elem| elem.is_none()) { + return Err(Box::from("intrinsic isn't fully supported in this test!")); + } + let args = args + .into_iter() + .map(|e| e.unwrap()) + .filter(|arg| arg.ty.ptr || arg.ty.kind != TypeKind::Void) + .collect::>(); + let arguments = ArgumentList:: { args }; + + if let Err(message) = result { + return Err(Box::from(message)); + } + Ok(Intrinsic { + name, + arguments, + results: result.unwrap(), + arch_tags: intr.cpuid, + }) +} \ No newline at end of file From 21798b1ce91b549d9901c4761f79b57a36bbd17e 
Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 3 Aug 2025 18:00:34 +0530 Subject: [PATCH 03/73] feat: updated intrinsics creation --- Cargo.lock | 53 ++++++++++++++++++++- crates/intrinsic-test/src/x86/mod.rs | 29 +++++++++-- crates/intrinsic-test/src/x86/xml_parser.rs | 4 +- 3 files changed, 80 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff503f3035..97bdfd5368 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -347,8 +347,10 @@ dependencies = [ "itertools", "log", "pretty_env_logger", + "quick-xml 0.37.5", "rayon", "serde", + "serde-xml-rs", "serde_json", ] @@ -452,6 +454,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quickcheck" version = "1.0.3" @@ -587,6 +599,18 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-xml-rs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53630160a98edebde0123eb4dfd0fce6adff091b2305db3154a9e920206eb510" +dependencies = [ + "log", + "serde", + "thiserror", + "xml-rs", +] + [[package]] name = "serde_derive" version = "1.0.219" @@ -698,7 +722,7 @@ name = "stdarch-verify" version = "0.1.0" dependencies = [ "proc-macro2", - "quick-xml", + "quick-xml 0.33.0", "quote", "serde", "serde_json", @@ -746,6 +770,26 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "unicode-ident" version = "1.0.18" @@ -953,10 +997,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] +<<<<<<< HEAD name = "windows_x86_64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +======= +name = "xml-rs" +version = "0.8.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7" +>>>>>>> 3283a857 (feat: updated intrinsics creation) [[package]] name = "yaml-rust" diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index ac613643bd..d1c23b8a94 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -3,10 +3,12 @@ mod intrinsic; mod types; mod xml_parser; -use crate::common::SupportedArchitectureTest; use crate::common::cli::ProcessedCli; -use crate::common::intrinsic::Intrinsic; +use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; +use crate::common::intrinsic_helpers::TypeKind; +use crate::common::SupportedArchitectureTest; use intrinsic::X86IntrinsicType; +use xml_parser::get_xml_intrinsics; pub struct X86ArchitectureTest { intrinsics: Vec>, @@ -15,7 +17,28 @@ pub struct X86ArchitectureTest { impl SupportedArchitectureTest for X86ArchitectureTest { fn create(cli_options: ProcessedCli) -> Box { - todo!("create in X86ArchitectureTest is not implemented") + let intrinsics = + get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); + + let mut intrinsics = intrinsics + .into_iter() + // Not sure how we would compare intrinsic that returns void. 
+ .filter(|i| i.results.kind() != TypeKind::Void) + .filter(|i| i.results.kind() != TypeKind::BFloat) + .filter(|i| i.arguments().args.len() > 0) + .filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat)) + // Skip pointers for now, we would probably need to look at the return + // type to work out how many elements we need to point to. + .filter(|i| !i.arguments.iter().any(|a| a.is_ptr())) + .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) + .filter(|i| !cli_options.skip.contains(&i.name)) + .collect::>(); + + intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); + Box::new(Self { + intrinsics: intrinsics, + cli_options: cli_options, + }) } fn build_c_file(&self) -> bool { diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 55bc33ca7c..7f76cbc40a 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -26,9 +26,9 @@ struct Data { #[derive(Deserialize)] struct XMLIntrinsic { #[serde(rename = "return")] - return_data: Parameter, + pub return_data: Parameter, #[serde(rename = "@name")] - name: String, + pub name: String, // #[serde(rename = "@tech")] // tech: String, #[serde(rename = "CPUID", default)] From bdc580139b1b12667740c23afe48842ed373d927 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 3 Aug 2025 18:25:44 +0530 Subject: [PATCH 04/73] feat: update building C code for x86 architecture. Notes: 1. 
chunk_info has been moved to `common/mod.rs` since it will be needed for all architectures --- crates/intrinsic-test/src/x86/compile.rs | 38 +++++++++++++ crates/intrinsic-test/src/x86/config.rs | 25 ++++++++ crates/intrinsic-test/src/x86/mod.rs | 72 +++++++++++++++++++++++- 3 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 crates/intrinsic-test/src/x86/compile.rs create mode 100644 crates/intrinsic-test/src/x86/config.rs diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs new file mode 100644 index 0000000000..170e40927f --- /dev/null +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -0,0 +1,38 @@ +use crate::common::cli::ProcessedCli; +use crate::common::compile_c::{CompilationCommandBuilder, CppCompilation}; + +pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { + let cpp_compiler = config.cpp_compiler.as_ref()?; + + // -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations + let mut command = CompilationCommandBuilder::new() + .add_arch_flags(vec![ + "avx", + "avx2", + "avx512f", + "avx512cd", + "avx512dq", + "avx512vl", + "avx512bw", + "avx512bf16", + "avx512bitalg", + "lzcnt", + "popcnt", + "adx", + "aes", + ]) + .set_compiler(cpp_compiler) + .set_target(&config.target) + .set_opt_level("2") + .set_cxx_toolchain_dir(config.cxx_toolchain_dir.as_deref()) + .set_project_root("c_programs") + .add_extra_flags(vec!["-ffp-contract=off", "-Wno-narrowing"]); + + if !cpp_compiler.contains("clang") { + command = command.add_extra_flag("-flax-vector-conversions"); + } + + let mut cpp_compiler = command.into_cpp_compilation(); + + Some(cpp_compiler) +} diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs new file mode 100644 index 0000000000..5a07dd745f --- /dev/null +++ b/crates/intrinsic-test/src/x86/config.rs @@ -0,0 +1,25 @@ +pub fn build_notices(line_prefix: &str) -> String { + format!( + "\ +{line_prefix}This is a 
transient test file, not intended for distribution. Some aspects of the +{line_prefix}test are derived from an XML specification, published under the same license as the +{line_prefix}`intrinsic-test` crate.\n +" + ) +} + +// Format f16 values (and vectors containing them) in a way that is consistent with C. +pub const F16_FORMATTING_DEF: &str = r#" +#[repr(transparent)] +struct Hex(T); + "#; + +pub const X86_CONFIGURATIONS: &str = r#" +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] +#![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] +#![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))] +#![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))] +#![feature(fmt_helpers_for_derive)] +"#; \ No newline at end of file diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index d1c23b8a94..c7b4a9d4aa 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -1,14 +1,21 @@ +mod compile; +mod config; mod constraint; mod intrinsic; mod types; mod xml_parser; +use std::fs::{self, File}; +use rayon::prelude::*; + use crate::common::cli::ProcessedCli; use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; use crate::common::intrinsic_helpers::TypeKind; -use crate::common::SupportedArchitectureTest; +use crate::common::{SupportedArchitectureTest, chunk_info}; +use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; use intrinsic::X86IntrinsicType; use xml_parser::get_xml_intrinsics; +use config::build_notices; pub struct X86ArchitectureTest { intrinsics: Vec>, @@ -42,7 +49,68 @@ impl SupportedArchitectureTest for X86ArchitectureTest { } fn build_c_file(&self) -> bool { - todo!("build_c_file in X86ArchitectureTest is not implemented") + let c_target = "x86_64"; + let platform_headers = &["immintrin.h"]; + + let 
(chunk_size, chunk_count) = chunk_info(self.intrinsics.len()); + + let cpp_compiler_wrapped = compile::build_cpp_compilation(&self.cli_options); + + let notice = &build_notices("// "); + fs::create_dir_all("c_programs").unwrap(); + self.intrinsics + .par_chunks(chunk_size) + .enumerate() + .map(|(i, chunk)| { + let c_filename = format!("c_programs/mod_{i}.cpp"); + let mut file = File::create(&c_filename).unwrap(); + write_mod_cpp(&mut file, notice, c_target, platform_headers, chunk).unwrap(); + + // compile this cpp file into a .o file. + // + // This is done because `cpp_compiler_wrapped` is None when + // the --generate-only flag is passed + if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { + let output = cpp_compiler + .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; + assert!(output.status.success(), "{output:?}"); + } + + Ok(()) + }) + .collect::>() + .unwrap(); + + let mut file = File::create("c_programs/main.cpp").unwrap(); + write_main_cpp( + &mut file, + c_target, + "\n", + self.intrinsics.iter().map(|i| i.name.as_str()), + ) + .unwrap(); + + // This is done because `cpp_compiler_wrapped` is None when + // the --generate-only flag is passed + if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { + // compile this cpp file into a .o file + info!("compiling main.cpp"); + let output = cpp_compiler + .compile_object_file("main.cpp", "intrinsic-test-programs.o") + .unwrap(); + assert!(output.status.success(), "{output:?}"); + + let object_files = (0..chunk_count) + .map(|i| format!("mod_{i}.o")) + .chain(["intrinsic-test-programs.o".to_owned()]); + + let output = cpp_compiler + .link_executable(object_files, "intrinsic-test-programs") + .unwrap(); + assert!(output.status.success(), "{output:?}"); + } + + true } fn build_rust_file(&self) -> bool { From adcd62969748dddae7999e9f153253487feb3583 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 3 Aug 2025 19:01:00 +0530 Subject: [PATCH 05/73] fix: code cleanup --- 
crates/intrinsic-test/src/x86/config.rs | 2 +- crates/intrinsic-test/src/x86/constraint.rs | 2 +- crates/intrinsic-test/src/x86/intrinsic.rs | 2 +- crates/intrinsic-test/src/x86/mod.rs | 8 +- crates/intrinsic-test/src/x86/types.rs | 230 +++++++++++++++++++- crates/intrinsic-test/src/x86/xml_parser.rs | 2 +- 6 files changed, 230 insertions(+), 16 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 5a07dd745f..427ec183a9 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -22,4 +22,4 @@ pub const X86_CONFIGURATIONS: &str = r#" #![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))] #![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))] #![feature(fmt_helpers_for_derive)] -"#; \ No newline at end of file +"#; diff --git a/crates/intrinsic-test/src/x86/constraint.rs b/crates/intrinsic-test/src/x86/constraint.rs index 321ee89b6c..1f06988388 100644 --- a/crates/intrinsic-test/src/x86/constraint.rs +++ b/crates/intrinsic-test/src/x86/constraint.rs @@ -17,4 +17,4 @@ pub fn map_constraints(imm_type: &String) -> Option { "_CMP_" => Some(Constraint::Range(0..32)), _ => None, } -} \ No newline at end of file +} diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index 27eca89406..1dedcb59a5 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -40,4 +40,4 @@ impl IntrinsicDefinition for Intrinsic { fn print_result_c(&self, _indentation: Indentation, _additional: &str) -> String { todo!("print_result_c in Intrinsic needs to be implemented!"); } -} \ No newline at end of file +} diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index c7b4a9d4aa..7f30a220f3 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -5,17 +5,17 @@ mod intrinsic; mod types; mod 
xml_parser; -use std::fs::{self, File}; use rayon::prelude::*; +use std::fs::{self, File}; use crate::common::cli::ProcessedCli; +use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; use crate::common::intrinsic_helpers::TypeKind; use crate::common::{SupportedArchitectureTest, chunk_info}; -use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; +use config::build_notices; use intrinsic::X86IntrinsicType; use xml_parser::get_xml_intrinsics; -use config::build_notices; pub struct X86ArchitectureTest { intrinsics: Vec>, @@ -120,4 +120,4 @@ impl SupportedArchitectureTest for X86ArchitectureTest { fn compare_outputs(&self) -> bool { todo!("compare_outputs in X86ArchitectureTest is not implemented") } -} \ No newline at end of file +} diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 3dd205c9cb..e27a182370 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -1,23 +1,125 @@ +use std::collections::HashMap; +use std::str::FromStr; + +use itertools::Itertools; +use regex::Regex; + use super::intrinsic::X86IntrinsicType; use crate::common::cli::Language; -use crate::common::intrinsic_helpers::IntrinsicTypeDefinition; +use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind}; use crate::x86::xml_parser::Parameter; impl IntrinsicTypeDefinition for X86IntrinsicType { /// Gets a string containing the type in C format. /// This function assumes that this value is present in the metadata hashmap. 
fn c_type(&self) -> String { - todo!("c_type from IntrinsicTypeDefinition is not defined!") + self.metadata + .get("type") + .expect("Failed to extract the C typename in X86!") + .to_string() } fn c_single_vector_type(&self) -> String { // matches __m128, __m256 and similar types - todo!("c_type from IntrinsicTypeDefinition is not defined!") + let re = Regex::new(r"\__m\d+\").unwrap(); + match self.metadata.get("type") { + Some(type_data) if re.is_match(type_data) => type_data.to_string(), + _ => unreachable!("Shouldn't be called on this type"), + } } + // fn rust_type(&self) -> String { + // // handling edge cases first + // // the general handling is implemented below + // if let Some(val) = self.metadata.get("type") { + // match val.as_str() { + // "__m128 const *" => { + // return "&__m128".to_string(); + // } + // "__m128d const *" => { + // return "&__m128d".to_string(); + // } + // "const void*" => { + // return "&__m128d".to_string(); + // } + // _ => {} + // } + // } + + // if self.kind() == TypeKind::Void && self.ptr { + // // this has been handled by default settings in + // // the from_param function of X86IntrinsicType + // unreachable!() + // } + + // // general handling cases + // let core_part = if self.kind() == TypeKind::Mask { + // // all types of __mmask are handled here + // format!("__mask{}", self.bit_len.unwrap()) + // } else if self.simd_len.is_some() { + // // all types of __m vector types are handled here + // let re = Regex::new(r"\__m\d+[a-z]*").unwrap(); + // let rust_type = self + // .metadata + // .get("type") + // .map(|val| re.find(val).unwrap().as_str()); + // rust_type.unwrap().to_string() + // } else { + // format!( + // "{}{}", + // self.kind.rust_prefix().to_string(), + // self.bit_len.unwrap() + // ) + // }; + + // // extracting "memsize" so that even vector types can be involved + // let memwidth = self + // .metadata + // .get("memwidth") + // .map(|n| str::parse::(n).unwrap()); + // let prefix_part = if self.ptr && 
self.constant && self.bit_len.eq(&memwidth) { + // "&" + // } else if self.ptr && self.bit_len.eq(&memwidth) { + // "&mut " + // } else if self.ptr && self.constant { + // "*const " + // } else if self.ptr { + // "*mut " + // } else { + // "" + // }; + + // return prefix_part.to_string() + core_part.as_str(); + // } + /// Determines the load function for this type. fn get_load_function(&self, _language: Language) -> String { - todo!("get_load_function from IntrinsicTypeDefinition is not defined!") + if let Some(type_value) = self.metadata.get("type") { + if type_value.starts_with("__mmask") { + // no need of loads, since they work directly + // with hex constants + String::from("*") + } else if type_value.starts_with("__m") { + // the structure is like the follows: + // if "type" starts with __m{h/i/}, + // then use either _mm_set1_epi64, + // _mm256_set1_epi64 or _mm512_set1_epi64 + let type_val_filtered = type_value + .chars() + .filter(|c| c.is_numeric()) + .join("") + .replace("128", ""); + format!("_mm{type_val_filtered}_set1_epi64") + } else { + // if it is a pointer, then rely on type conversion + // If it is not any of the above type (__int, __bfloat16, unsigned short, etc) + // then typecast it. + format!("({type_value})") + } + // Look for edge cases (constexpr, literal, etc) + } else { + unimplemented!("the value for key 'type' is not present!"); + } } /// Determines the get lane function for this type. 
@@ -25,13 +127,125 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { todo!("get_lane_function for X86IntrinsicType needs to be implemented!"); } - fn from_c(s: &str, target: &str) -> Result { - todo!("from_c from IntrinsicTypeDefinition is not defined!") + fn from_c(s: &str) -> Result { + let mut s_copy = s.to_string(); + let mut metadata: HashMap = HashMap::new(); + metadata.insert("type".to_string(), s.to_string()); + s_copy = s_copy + .replace("*", "") + .replace("_", "") + .replace("constexpr", "") + .replace("const", "") + .replace("literal", ""); + + let s_split = s_copy + .split(" ") + .filter_map(|s| if s.len() == 0 { None } else { Some(s) }) + .last(); + + let s_split = s_split.map(|s| s.chars().filter(|c| !c.is_numeric()).join("")); + + // TODO: make the unwrapping safe + let kind = TypeKind::from_str(s_split.unwrap().trim()).unwrap_or(TypeKind::Void); + + let kind = if s.find("unsigned").is_some() { + match kind { + TypeKind::Int(_) => TypeKind::Int(Sign::Unsigned), + TypeKind::Char(_) => TypeKind::Char(Sign::Unsigned), + a => a, + } + } else { + kind + }; + + let ptr_constant = false; + let constant = s.matches("const").next().is_some(); + let ptr = s.matches("*").next().is_some(); + + Ok(X86IntrinsicType(IntrinsicType { + ptr, + ptr_constant, + constant, + kind, + bit_len: None, + simd_len: None, + vec_len: None, + metadata, + })) } } impl X86IntrinsicType { pub fn from_param(param: &Parameter) -> Result { - todo!("from_param from X86IntrinsicType is not defined!") + match Self::from_c(param.type_data.as_str()) { + Err(message) => Err(message), + Ok(mut ret) => { + // First correct the type of the parameter using param.etype. + // The assumption is that the parameter of type void may have param.type + // as "__m128i", "__mmask8" and the like. 
+ ret.set_metadata("etype".to_string(), param.etype.clone()); + ret.set_metadata("memwidth".to_string(), param.memwidth.to_string()); + if !param.etype.is_empty() { + match TypeKind::from_str(param.etype.as_str()) { + Ok(value) => { + ret.kind = value; + } + Err(_) => {} + }; + } + + // check for param.etype. + // extract the numeric part and set as bit-len + // If param.etype is not present, guess the default bit-len + + let mut etype_processed = param.etype.clone(); + etype_processed.retain(|c| c.is_numeric()); + + match str::parse::(etype_processed.as_str()) { + Ok(value) => ret.bit_len = Some(value), + Err(_) => { + ret.bit_len = match ret.kind() { + TypeKind::Char(_) => Some(8), + TypeKind::BFloat => Some(16), + TypeKind::Int(_) => Some(32), + TypeKind::Float => Some(32), + _ => None, + }; + } + } + + // then check the param.type and extract numeric part if there are double + // underscores. divide this number with bit-len and set this as simd-len. + // Only __m types can have a simd-len. + if param.type_data.matches("__m").next().is_some() + && param.type_data.matches("__mmask").next().is_none() + { + let mut type_processed = param.type_data.clone(); + type_processed.retain(|c| c.is_numeric()); + ret.vec_len = match str::parse::(type_processed.as_str()) { + // If bit_len is None, vec_len will be None. + // Else vec_len will be (num_bits / bit_len). + Ok(num_bits) => ret.bit_len.and(Some(num_bits / ret.bit_len.unwrap())), + Err(_) => None, + }; + } + + // default settings for "void *" parameters + // often used by intrinsics to denote memory address or so. + if ret.kind == TypeKind::Void && ret.ptr { + ret.kind = TypeKind::Int(Sign::Unsigned); + ret.bit_len = Some(8); + } + + // if param.etype == IMM, then it is a constant. + // else it stays unchanged. + ret.constant |= param.etype == "IMM"; + + Ok(ret) + } + } + // Tile types won't currently reach here, since the intrinsic that involve them + // often return "null" type. 
Such intrinsics are not tested in `intrinsic-test` + // currently and are filtered out at `mod.rs`. } -} \ No newline at end of file +} diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 7f76cbc40a..b26e18840d 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -112,4 +112,4 @@ fn xml_to_intrinsic( results: result.unwrap(), arch_tags: intr.cpuid, }) -} \ No newline at end of file +} From 23b3ff9af04371900475371d239c50c777c4b6c5 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 15:46:01 +0530 Subject: [PATCH 06/73] chore: added Regex crate, updated the structure of X86IntrinsicType struct --- Cargo.lock | 1 + crates/intrinsic-test/Cargo.toml | 1 + crates/intrinsic-test/src/x86/compile.rs | 2 +- crates/intrinsic-test/src/x86/intrinsic.rs | 10 ++- crates/intrinsic-test/src/x86/types.rs | 98 ++++++++++----------- crates/intrinsic-test/src/x86/xml_parser.rs | 2 +- 6 files changed, 59 insertions(+), 55 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97bdfd5368..26a4223271 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -349,6 +349,7 @@ dependencies = [ "pretty_env_logger", "quick-xml 0.37.5", "rayon", + "regex", "serde", "serde-xml-rs", "serde_json", diff --git a/crates/intrinsic-test/Cargo.toml b/crates/intrinsic-test/Cargo.toml index df4f7fe706..2c0f53897e 100644 --- a/crates/intrinsic-test/Cargo.toml +++ b/crates/intrinsic-test/Cargo.toml @@ -21,3 +21,4 @@ diff = "0.1.12" itertools = "0.14.0" quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] } serde-xml-rs = "0.8.0" +regex = "1.11.1" diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 170e40927f..e8c2262b85 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -32,7 +32,7 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { command = 
command.add_extra_flag("-flax-vector-conversions"); } - let mut cpp_compiler = command.into_cpp_compilation(); + let cpp_compiler = command.into_cpp_compilation(); Some(cpp_compiler) } diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index 1dedcb59a5..0261a2df85 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -2,22 +2,26 @@ use crate::common::argument::ArgumentList; use crate::common::indentation::Indentation; use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; use crate::common::intrinsic_helpers::IntrinsicType; +use crate::x86::xml_parser::Parameter; use std::ops::{Deref, DerefMut}; #[derive(Debug, Clone, PartialEq)] -pub struct X86IntrinsicType(pub IntrinsicType); +pub struct X86IntrinsicType { + pub data: IntrinsicType, + pub param: Parameter, +} impl Deref for X86IntrinsicType { type Target = IntrinsicType; fn deref(&self) -> &Self::Target { - &self.0 + &self.data } } impl DerefMut for X86IntrinsicType { fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 + &mut self.data } } diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index e27a182370..542d1ad3fa 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -13,18 +13,16 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { /// Gets a string containing the type in C format. /// This function assumes that this value is present in the metadata hashmap. 
fn c_type(&self) -> String { - self.metadata - .get("type") - .expect("Failed to extract the C typename in X86!") - .to_string() + self.param.type_data.clone() } fn c_single_vector_type(&self) -> String { // matches __m128, __m256 and similar types let re = Regex::new(r"\__m\d+\").unwrap(); - match self.metadata.get("type") { - Some(type_data) if re.is_match(type_data) => type_data.to_string(), - _ => unreachable!("Shouldn't be called on this type"), + if re.is_match(self.param.type_data.as_str()) { + self.param.type_data.clone() + } else { + unreachable!("Shouldn't be called on this type") } } @@ -94,40 +92,42 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { /// Determines the load function for this type. fn get_load_function(&self, _language: Language) -> String { - if let Some(type_value) = self.metadata.get("type") { - if type_value.starts_with("__mmask") { - // no need of loads, since they work directly - // with hex constants - String::from("*") - } else if type_value.starts_with("__m") { - // the structure is like the follows: - // if "type" starts with __m{h/i/}, - // then use either _mm_set1_epi64, - // _mm256_set1_epi64 or _mm512_set1_epi64 - let type_val_filtered = type_value - .chars() - .filter(|c| c.is_numeric()) - .join("") - .replace("128", ""); - format!("_mm{type_val_filtered}_set1_epi64") - } else { - // if it is a pointer, then rely on type conversion - // If it is not any of the above type (__int, __bfloat16, unsigned short, etc) - // then typecast it. 
- format!("({type_value})") - } - // Look for edge cases (constexpr, literal, etc) - } else { + let type_value = self.param.type_data.clone(); + if type_value.len() == 0 { unimplemented!("the value for key 'type' is not present!"); } + if type_value.starts_with("__mmask") { + // no need of loads, since they work directly + // with hex constants + String::from("*") + } else if type_value.starts_with("__m") { + // the structure is like the follows: + // if "type" starts with __m{h/i/}, + // then use either _mm_set1_epi64, + // _mm256_set1_epi64 or _mm512_set1_epi64 + let type_val_filtered = type_value + .chars() + .filter(|c| c.is_numeric()) + .join("") + .replace("128", ""); + format!("_mm{type_val_filtered}_set1_epi64") + } else { + // if it is a pointer, then rely on type conversion + // If it is not any of the above type (__int, __bfloat16, unsigned short, etc) + // then typecast it. + format!("({type_value})") + } + // Look for edge cases (constexpr, literal, etc) } /// Determines the get lane function for this type. 
fn get_lane_function(&self) -> String { todo!("get_lane_function for X86IntrinsicType needs to be implemented!"); } +} - fn from_c(s: &str) -> Result { +impl X86IntrinsicType { + fn from_c(s: &str) -> Result { let mut s_copy = s.to_string(); let mut metadata: HashMap = HashMap::new(); metadata.insert("type".to_string(), s.to_string()); @@ -162,7 +162,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { let constant = s.matches("const").next().is_some(); let ptr = s.matches("*").next().is_some(); - Ok(X86IntrinsicType(IntrinsicType { + Ok(IntrinsicType { ptr, ptr_constant, constant, @@ -170,25 +170,20 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { bit_len: None, simd_len: None, vec_len: None, - metadata, - })) + }) } -} -impl X86IntrinsicType { pub fn from_param(param: &Parameter) -> Result { match Self::from_c(param.type_data.as_str()) { Err(message) => Err(message), - Ok(mut ret) => { + Ok(mut data) => { // First correct the type of the parameter using param.etype. // The assumption is that the parameter of type void may have param.type // as "__m128i", "__mmask8" and the like. 
- ret.set_metadata("etype".to_string(), param.etype.clone()); - ret.set_metadata("memwidth".to_string(), param.memwidth.to_string()); if !param.etype.is_empty() { match TypeKind::from_str(param.etype.as_str()) { Ok(value) => { - ret.kind = value; + data.kind = value; } Err(_) => {} }; @@ -202,9 +197,9 @@ impl X86IntrinsicType { etype_processed.retain(|c| c.is_numeric()); match str::parse::(etype_processed.as_str()) { - Ok(value) => ret.bit_len = Some(value), + Ok(value) => data.bit_len = Some(value), Err(_) => { - ret.bit_len = match ret.kind() { + data.bit_len = match data.kind() { TypeKind::Char(_) => Some(8), TypeKind::BFloat => Some(16), TypeKind::Int(_) => Some(32), @@ -222,26 +217,29 @@ impl X86IntrinsicType { { let mut type_processed = param.type_data.clone(); type_processed.retain(|c| c.is_numeric()); - ret.vec_len = match str::parse::(type_processed.as_str()) { + data.vec_len = match str::parse::(type_processed.as_str()) { // If bit_len is None, vec_len will be None. // Else vec_len will be (num_bits / bit_len). - Ok(num_bits) => ret.bit_len.and(Some(num_bits / ret.bit_len.unwrap())), + Ok(num_bits) => data.bit_len.and(Some(num_bits / data.bit_len.unwrap())), Err(_) => None, }; } // default settings for "void *" parameters // often used by intrinsics to denote memory address or so. - if ret.kind == TypeKind::Void && ret.ptr { - ret.kind = TypeKind::Int(Sign::Unsigned); - ret.bit_len = Some(8); + if data.kind == TypeKind::Void && data.ptr { + data.kind = TypeKind::Int(Sign::Unsigned); + data.bit_len = Some(8); } // if param.etype == IMM, then it is a constant. // else it stays unchanged. 
- ret.constant |= param.etype == "IMM"; + data.constant |= param.etype == "IMM"; - Ok(ret) + Ok(X86IntrinsicType { + data, + param: param.clone(), + }) } } // Tile types won't currently reach here, since the intrinsic that involve them diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index b26e18840d..0b422bddb5 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -37,7 +37,7 @@ struct XMLIntrinsic { parameters: Vec, } -#[derive(Deserialize)] +#[derive(Debug, PartialEq, Clone, Deserialize)] pub struct Parameter { #[serde(rename = "@varname")] pub var_name: String, From b3292c3bc5d77ced3014f49763f9edd80349373d Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 15:56:09 +0530 Subject: [PATCH 07/73] feat: implemented build_rust_file of `x86` module --- crates/intrinsic-test/src/x86/mod.rs | 59 +++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 7f30a220f3..9030205c78 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -10,9 +10,13 @@ use std::fs::{self, File}; use crate::common::cli::ProcessedCli; use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; +use crate::common::gen_rust::{ + compile_rust_programs, write_bin_cargo_toml, write_lib_cargo_toml, write_lib_rs, write_main_rs, +}; use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; use crate::common::intrinsic_helpers::TypeKind; use crate::common::{SupportedArchitectureTest, chunk_info}; +use crate::x86::config::{F16_FORMATTING_DEF, X86_CONFIGURATIONS}; use config::build_notices; use intrinsic::X86IntrinsicType; use xml_parser::get_xml_intrinsics; @@ -114,7 +118,60 @@ impl SupportedArchitectureTest for X86ArchitectureTest { } fn build_rust_file(&self) -> bool { - todo!("build_rust_file in X86ArchitectureTest 
is not implemented") + std::fs::create_dir_all("rust_programs/src").unwrap(); + + let architecture = if self.cli_options.target.contains("v7") { + "arm" + } else { + "aarch64" + }; + + let (chunk_size, chunk_count) = chunk_info(self.intrinsics.len()); + + let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); + write_bin_cargo_toml(&mut cargo, chunk_count).unwrap(); + + let mut main_rs = File::create("rust_programs/src/main.rs").unwrap(); + write_main_rs( + &mut main_rs, + chunk_count, + X86_CONFIGURATIONS, + "", + self.intrinsics.iter().map(|i| i.name.as_str()), + ) + .unwrap(); + + let target = &self.cli_options.target; + let toolchain = self.cli_options.toolchain.as_deref(); + let linker = self.cli_options.linker.as_deref(); + + let notice = &build_notices("// "); + self.intrinsics + .par_chunks(chunk_size) + .enumerate() + .map(|(i, chunk)| { + std::fs::create_dir_all(format!("rust_programs/mod_{i}/src"))?; + + let rust_filename = format!("rust_programs/mod_{i}/src/lib.rs"); + trace!("generating `{rust_filename}`"); + let mut file = File::create(rust_filename)?; + + let cfg = X86_CONFIGURATIONS; + let definitions = F16_FORMATTING_DEF; + write_lib_rs(&mut file, architecture, notice, cfg, definitions, chunk)?; + + let toml_filename = format!("rust_programs/mod_{i}/Cargo.toml"); + trace!("generating `{toml_filename}`"); + let mut file = File::create(toml_filename).unwrap(); + + write_lib_cargo_toml(&mut file, &format!("mod_{i}"))?; + + Ok(()) + }) + .collect::>() + .unwrap(); + + compile_rust_programs(toolchain, target, linker) } fn compare_outputs(&self) -> bool { From 75ff3134ab322cea06fe1f9a17d0c655dd2bbb11 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 15:57:55 +0530 Subject: [PATCH 08/73] feat: implemented compare_outputs of `x86` module --- crates/intrinsic-test/src/x86/mod.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/mod.rs 
b/crates/intrinsic-test/src/x86/mod.rs index 9030205c78..5515e68385 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -9,6 +9,7 @@ use rayon::prelude::*; use std::fs::{self, File}; use crate::common::cli::ProcessedCli; +use crate::common::compare::compare_outputs; use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; use crate::common::gen_rust::{ compile_rust_programs, write_bin_cargo_toml, write_lib_cargo_toml, write_lib_rs, write_main_rs, @@ -175,6 +176,20 @@ impl SupportedArchitectureTest for X86ArchitectureTest { } fn compare_outputs(&self) -> bool { - todo!("compare_outputs in X86ArchitectureTest is not implemented") + if self.cli_options.toolchain.is_some() { + let intrinsics_name_list = self + .intrinsics + .iter() + .map(|i| i.name.clone()) + .collect::>(); + + compare_outputs( + &intrinsics_name_list, + &self.cli_options.runner, + &self.cli_options.target, + ) + } else { + true + } } } From 1c82e7f5d59464d9335bd0b4174a716981c1f8eb Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 16:16:26 +0530 Subject: [PATCH 09/73] feat: implement `print_result_c` for `Intrinsic` --- crates/intrinsic-test/src/x86/intrinsic.rs | 68 +++++++++++++++++++++- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index 0261a2df85..169394b793 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -1,7 +1,7 @@ use crate::common::argument::ArgumentList; use crate::common::indentation::Indentation; use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; -use crate::common::intrinsic_helpers::IntrinsicType; +use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, TypeKind}; use crate::x86::xml_parser::Parameter; use std::ops::{Deref, DerefMut}; @@ -41,7 +41,69 @@ impl IntrinsicDefinition for Intrinsic { /// Generates a std::cout 
for the intrinsics results that will match the /// rust debug output format for the return type. The generated line assumes /// there is an int i in scope which is the current pass number. - fn print_result_c(&self, _indentation: Indentation, _additional: &str) -> String { - todo!("print_result_c in Intrinsic needs to be implemented!"); + fn print_result_c(&self, indentation: Indentation, additional: &str) -> String { + let lanes = if self.results().num_vectors() > 1 { + (0..self.results().num_vectors()) + .map(|vector| { + format!( + r#""{ty}(" << {lanes} << ")""#, + ty = self.results().c_single_vector_type(), + lanes = (0..self.results().num_lanes()) + .map(move |idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", + cast = self.results().c_promotion(), + lane_fn = self.results().get_lane_function(), + lane = idx, + vector = vector, + ) + }) + .collect::>() + .join(r#" << ", " << "#) + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else if self.results().num_lanes() > 1 { + (0..self.results().num_lanes()) + .map(|idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value, {lane})", + cast = self.results().c_promotion(), + lane_fn = self.results().get_lane_function(), + lane = idx + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else { + format!( + "{promote}cast<{cast}>(__return_value)", + cast = match self.results.kind() { + TypeKind::Void => "void".to_string(), + TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), + TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), + // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), + // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 32 => "int".to_string(), + // TypeKind::Int(false) 
if self.results().inner_size() == 32 => "unsigned int".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 16 => "short".to_string(), + // TypeKind::Int(false) if self.results().inner_size() == 16 => "unsigned short".to_string(), + _ => self.results.c_scalar_type(), + }, + promote = self.results().c_promotion(), + ) + }; + + format!( + r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#, + ty = if self.results().is_simd() { + format!("{}(", self.results().c_type()) + } else { + String::from("") + }, + close = if self.results.is_simd() { ")" } else { "" }, + ) } } From 3fddae0889000f1d993c3059c8423fc72ae37cac Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 21:45:22 +0530 Subject: [PATCH 10/73] feat: Added x86 to CI pipeline --- ci/run.sh | 3 + intrinsics_data/x86-intel.xml | 158422 +++++++++++++++++++++++++++++++ 2 files changed, 158425 insertions(+) create mode 100644 intrinsics_data/x86-intel.xml diff --git a/ci/run.sh b/ci/run.sh index 2bb77bae25..d8af9b7697 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -90,6 +90,9 @@ fi # Test targets compiled with extra features. case ${TARGET} in x86_64-unknown-linux-gnu) + TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" + TEST_CXX_COMPILER="clang++-19" + TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" export STDARCH_DISABLE_ASSERT_INSTR=1 export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" diff --git a/intrinsics_data/x86-intel.xml b/intrinsics_data/x86-intel.xml new file mode 100644 index 0000000000..41f2119e68 --- /dev/null +++ b/intrinsics_data/x86-intel.xml @@ -0,0 +1,158422 @@ + + + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 
1 : 0) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + + + ADX +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + + + ADX +
immintrin.h
+ Arithmetic +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." + a[127:0] := ShiftRows(a[127:0]) +a[127:0] := SubBytes(a[127:0]) +a[127:0] := MixColumns(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." + a[127:0] := ShiftRows(a[127:0]) +a[127:0] := SubBytes(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". + a[127:0] := InvShiftRows(a[127:0]) +a[127:0] := InvSubBytes(a[127:0]) +a[127:0] := InvMixColumns(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". + a[127:0] := InvShiftRows(a[127:0]) +a[127:0] := InvSubBytes(a[127:0]) +dst[127:0] := a[127:0] XOR RoundKey[127:0] + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + Perform the InvMixColumns transformation on "a" and store the result in "dst". + dst[127:0] := InvMixColumns(a[127:0]) + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst"." + X3[31:0] := a[127:96] +X2[31:0] := a[95:64] +X1[31:0] := a[63:32] +X0[31:0] := a[31:0] +RCON[31:0] := ZeroExtend32(imm8[7:0]) +dst[31:0] := SubWord(X1) +dst[63:32] := RotWord(SubWord(X1)) XOR RCON +dst[95:64] := SubWord(X3) +dst[127:96] := RotWord(SubWord(X3)) XOR RCON + + + AES +
wmmintrin.h
+ Cryptography +
+ + + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].bf16[2*k+0]) * FP32(b.row[k].bf16[2*n+0]) + tmp.fp32[n] += FP32(a.row[m].bf16[2*k+1]) * FP32(b.row[k].bf16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-BF16 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+0]) * FP32(src1.row[k].bf16[2*n+0]) + tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+1]) * FP32(src1.row[k].bf16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-BF16 +
immintrin.h
+ Application-Targeted +
+ + + + + + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the imaginary part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The imaginary part of the "a" element is multiplied with the real part of the corresponding "b" element, and the real part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + + + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the real part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The real part of the "a" element is multiplied with the real part of the corresponding "b" element, and the negated imaginary part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "src0" and "src1" is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the imaginary part of the result. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+1]) + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+0]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles src0 and src1 is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the real part of the result. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(-src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-COMPLEX +
immintrin.h
+ Application-Targeted +
+ + + + + + + Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-FP16 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) + tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-FP16 +
immintrin.h
+ Application-Targeted +
+ + + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (a.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + DEFINE DPBD(c, x, y) { + tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) + tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) + tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) + tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) + RETURN c + tmp1 + tmp2 + tmp3 + tmp4 +} +FOR m := 0 TO dst.rows - 1 + tmp := dst.row[m] + FOR k := 0 TO (src0.colsb / 4) - 1 + FOR n := 0 TO (dst.colsb / 4) - 1 + tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) + ENDFOR + ENDFOR + write_row_and_zero(dst, m, tmp, dst.colsb) +ENDFOR +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-INT8 +
immintrin.h
+ Application-Targeted +
+ + + + + Load tile configuration from a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type palette, the number of bytes per row, and the number of rows. If the specified palette_id is zero, that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. Any invalid configurations will result in #GP fault. + +// format of memory payload. each field is a byte. +// 0: palette +// 1: start_row +// 2-15: reserved, must be zero +// 16-17: tile0.colsb +// 18-19: tile1.colsb +// 20-21: tile2.colsb +// ... +// 30-31: tile7.colsb +// 32-47: reserved, must be zero +// 48: tile0.rows +// 49: tile1.rows +// 50: tile2.rows +// ... +// 55: tile7.rows +// 56-63: reserved, must be zero + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + Stores the current tile configuration to a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type palette, the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory. + +// format of memory payload. each field is a byte. +// 0: palette +// 1: start_row +// 2-15: reserved, must be zero +// 16-17: tile0.colsb +// 18-19: tile1.colsb +// 20-21: tile2.colsb +// ... +// 30-31: tile7.colsb +// 32-47: reserved, must be zero +// 48: tile0.rows +// 49: tile1.rows +// 50: tile2.rows +// ... +// 55: tile7.rows +// 56-63: reserved, must be zero + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Load tile rows from memory specified by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Load tile rows from memory specified by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Release the tile configuration to return to the init state, which releases all storage it currently holds. + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + + Store the tile specified by "src" to memory specified by "base" address and "stride" using the tile configuration previously configured via "_tile_loadconfig". + start := tileconfig.startRow +DO WHILE start < src.rows + memptr := base + start * stride + write_memory(memptr, src.colsb, src.row[start]) + start := start + 1 +OD +zero_tileconfig_start() + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + Zero the tile specified by "tdest". + nbytes := palette_table[tileconfig.palette_id].bytes_per_row +FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 + FOR j := 0 TO nbytes-1 + tdest.row[i].byte[j] := 0 + ENDFOR +ENDFOR + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Load tile rows from memory specified by "base" address and "stride" into destination tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Store the tile specified by "src" to memory specified by "base" address and "stride". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +DO WHILE start < src.rows + memptr := base + start * stride + write_memory(memptr, src.colsb, src.row[start]) + start := start + 1 +OD +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Load tile rows from memory specified by "base" address and "stride" into destination tile "dst". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + start := tileconfig.startRow +IF start == 0 // not restarting, zero incoming state + tilezero(dst) +FI +nbytes := dst.colsb +DO WHILE start < dst.rows + memptr := base + start * stride + write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) + start := start + 1 +OD +zero_upper_rows(dst, dst.rows) +zero_tileconfig_start() + + + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + Zero the tile specified by "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. + + nbytes := palette_table[tileconfig.palette_id].bytes_per_row +FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 + FOR j := 0 TO nbytes-1 + dst.row[i].byte[j] := 0 + ENDFOR +ENDFOR + + + AMX-TILE +
immintrin.h
+ Application-Targeted +
+ + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CEXP(a[31:0], b[31:0]) { + result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) + result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CLOG(a[31:0], b[31:0]) { + result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) + result[63:32] := ATAN2(b, a) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvCubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 31 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 15 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + b[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + b[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 3 + i := j*32 + IF imm8[(4+j)%8] + temp[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + temp[i+31:i] := FP32(0.0) + FI + ENDFOR + + sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) + + FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + tmpdst[i+31:i] := sum[31:0] + ELSE + tmpdst[i+31:i] := FP32(0.0) + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) +dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[127:64] + a[63:0] +dst[127:64] := b[127:64] + b[63:0] +dst[191:128] := a[255:192] + a[191:128] +dst[255:192] := b[255:192] + b[191:128] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] +dst[159:128] := a[191:160] + a[159:128] +dst[191:160] := a[255:224] + a[223:192] +dst[223:192] := b[191:160] + b[159:128] +dst[255:224] := b[255:224] + b[223:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[63:0] - a[127:64] +dst[127:64] := b[63:0] - b[127:64] +dst[191:128] := a[191:128] - a[255:192] +dst[255:192] := b[191:128] - b[255:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] +dst[159:128] := a[159:128] - a[191:160] +dst[191:160] := a[223:192] - a[255:224] +dst[223:192] := b[159:128] - b[191:160] +dst[255:224] := b[223:192] - b[255:224] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Arithmetic +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[255:0] AND b[255:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[255:0]) AND b[255:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[63] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[255:0] := a[255:0] AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[255:0] := (NOT a[255:0]) AND b[255:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ + tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := ZF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + AVX +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +tmp[127:0] := a[127:0] AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + ZF := 1 +ELSE + ZF := 0 +FI +tmp[127:0] := (NOT a[127:0]) AND b[127:0] +IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + AVX +
immintrin.h
+ Logical +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "index", and store the result in "dst". + +dst[31:0] := (a[255:0] >> (index[2:0] * 32))[31:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "index", and store the result in "dst". + +dst[63:0] := (a[255:0] >> (index[1:0] * 64))[63:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[4:0]*8 +dst[sel+7:sel] := i[7:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[3:0]*16 +dst[sel+15:sel] := i[15:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[2:0]*32 +dst[sel+31:sel] := i[31:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". + +dst[255:0] := a[255:0] +sel := index[1:0]*64 +dst[sel+63:sel] := i[63:0] + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Swizzle +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i], rounding) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i], rounding) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX +
immintrin.h
+ Convert +
+ + + + Zero the contents of all XMM or YMM registers. + YMM0[MAX:0] := 0 +YMM1[MAX:0] := 0 +YMM2[MAX:0] := 0 +YMM3[MAX:0] := 0 +YMM4[MAX:0] := 0 +YMM5[MAX:0] := 0 +YMM6[MAX:0] := 0 +YMM7[MAX:0] := 0 +IF _64_BIT_MODE + YMM8[MAX:0] := 0 + YMM9[MAX:0] := 0 + YMM10[MAX:0] := 0 + YMM11[MAX:0] := 0 + YMM12[MAX:0] := 0 + YMM13[MAX:0] := 0 + YMM14[MAX:0] := 0 + YMM15[MAX:0] := 0 +FI + + + AVX +
immintrin.h
+ General Support +
+ + + + Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. + YMM0[MAX:128] := 0 +YMM1[MAX:128] := 0 +YMM2[MAX:128] := 0 +YMM3[MAX:128] := 0 +YMM4[MAX:128] := 0 +YMM5[MAX:128] := 0 +YMM6[MAX:128] := 0 +YMM7[MAX:128] := 0 +IF _64_BIT_MODE + YMM8[MAX:128] := 0 + YMM9[MAX:128] := 0 + YMM10[MAX:128] := 0 + YMM11[MAX:128] := 0 + YMM12[MAX:128] := 0 + YMM13[MAX:128] := 0 + YMM14[MAX:128] := 0 + YMM15[MAX:128] := 0 +FI + + + AVX +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256 with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256d with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256i with undefined elements. + AVX +
immintrin.h
+ General Support +
+ + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". + +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". + +tmp[31:0] := MEM[mem_addr+31:mem_addr] +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". + +tmp[63:0] := MEM[mem_addr+63:mem_addr] +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". + +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + Swizzle + + + Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". + +tmp[127:0] := MEM[mem_addr+127:mem_addr] +dst[127:0] := tmp[127:0] +dst[255:128] := tmp[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst". + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +dst[127:0] := MEM[loaddr+127:loaddr] +dst[255:128] := MEM[hiaddr+127:hiaddr] +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into two different 128-bit memory locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into two different 128-bit memory locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + + + Store the high and low 128-bit halves (each composed of integer data) from "a" into two different 128-bit memory locations. + "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. + +MEM[loaddr+127:loaddr] := a[127:0] +MEM[hiaddr+127:hiaddr] := a[255:128] + + AVX +
immintrin.h
+ Store +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Move +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := 1.0 / a[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Elementary Math Functions +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". + +FOR j := 0 to 3 + i := j*64 + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:4] := 0 + + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". + +FOR j := 0 to 7 + i := j*32 + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:8] := 0 + + + AVX +
immintrin.h
+ Miscellaneous +
+ + + + Return vector of type __m256d with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Return vector of type __m256 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Return vector of type __m256i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e31 +dst[15:8] := e30 +dst[23:16] := e29 +dst[31:24] := e28 +dst[39:32] := e27 +dst[47:40] := e26 +dst[55:48] := e25 +dst[63:56] := e24 +dst[71:64] := e23 +dst[79:72] := e22 +dst[87:80] := e21 +dst[95:88] := e20 +dst[103:96] := e19 +dst[111:104] := e18 +dst[119:112] := e17 +dst[127:120] := e16 +dst[135:128] := e15 +dst[143:136] := e14 +dst[151:144] := e13 +dst[159:152] := e12 +dst[167:160] := e11 +dst[175:168] := e10 +dst[183:176] := e9 +dst[191:184] := e8 +dst[199:192] := e7 +dst[207:200] := e6 +dst[215:208] := e5 +dst[223:216] := e4 +dst[231:224] := e3 +dst[239:232] := e2 +dst[247:240] := e1 +dst[255:248] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e15 +dst[31:16] := e14 +dst[47:32] := e13 +dst[63:48] := e12 +dst[79:64] := e11 +dst[95:80] := e10 +dst[111:96] := e9 +dst[127:112] := e8 +dst[143:128] := e7 +dst[159:144] := e6 +dst[175:160] := e5 +dst[191:176] := e4 +dst[207:192] := e3 +dst[223:208] := e2 +dst[239:224] := e1 +dst[255:240] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e7 +dst[63:32] := e6 +dst[95:64] := e5 +dst[127:96] := e4 +dst[159:128] := e3 +dst[191:160] := e2 +dst[223:192] := e1 +dst[255:224] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e3 +dst[127:64] := e2 +dst[191:128] := e1 +dst[255:192] := e0 +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256 vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256d vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256i vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256 vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256d vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + + Set packed __m256i vector "dst" with the supplied values. + +dst[127:0] := lo[127:0] +dst[255:128] := hi[127:0] +dst[MAX:256] := 0 + + + AVX +
immintrin.h
+ Set +
+ + + + Cast vector of type __m256d to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX +
immintrin.h
+ Cast +
+ + + + + + + Extract an 8-bit integer from "a", selected with "index", and store the result in "dst". + +dst[7:0] := (a[255:0] >> (index[4:0] * 8))[7:0] + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "index", and store the result in "dst". + +dst[15:0] := (a[255:0] >> (index[3:0] * 16))[15:0] + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[j%8] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + IF mask[i+7] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". + +dst[127:0] := a[127:0] +dst[255:128] := a[127:0] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". + +dst[127:0] := a[127:0] +dst[255:128] := a[127:0] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src1, src2, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src1[127:0] + 1: tmp[127:0] := src1[255:128] + 2: tmp[127:0] := src2[127:0] + 3: tmp[127:0] := src2[255:128] + ESAC + IF control[3] + tmp[127:0] := 0 + FI + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) +dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + IF b[128+i+7] == 1 + dst[128+i+7:128+i] := 0 + ELSE + index[3:0] := b[128+i+3:128+i] + dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Swizzle +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Special Math Functions +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := a[95:80] + a[79:64] +dst[63:48] := a[127:112] + a[111:96] +dst[79:64] := b[31:16] + b[15:0] +dst[95:80] := b[63:48] + b[47:32] +dst[111:96] := b[95:80] + b[79:64] +dst[127:112] := b[127:112] + b[111:96] +dst[143:128] := a[159:144] + a[143:128] +dst[159:144] := a[191:176] + a[175:160] +dst[175:160] := a[223:208] + a[207:192] +dst[191:176] := a[255:240] + a[239:224] +dst[207:192] := b[159:144] + b[143:128] +dst[223:208] := b[191:176] + b[175:160] +dst[239:224] := b[223:208] + b[207:192] +dst[255:240] := b[255:240] + b[239:224] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] +dst[159:128] := a[191:160] + a[159:128] +dst[191:160] := a[255:224] + a[223:192] +dst[223:192] := b[191:160] + b[159:128] +dst[255:224] := b[255:224] + b[223:192] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(a[95:80] + a[79:64]) +dst[63:48] := Saturate16(a[127:112] + a[111:96]) +dst[79:64] := Saturate16(b[31:16] + b[15:0]) +dst[95:80] := Saturate16(b[63:48] + b[47:32]) +dst[111:96] := Saturate16(b[95:80] + b[79:64]) +dst[127:112] := Saturate16(b[127:112] + b[111:96]) +dst[143:128] := Saturate16(a[159:144] + a[143:128]) +dst[159:144] := Saturate16(a[191:176] + a[175:160]) +dst[175:160] := Saturate16(a[223:208] + a[207:192]) +dst[191:176] := Saturate16(a[255:240] + a[239:224]) +dst[207:192] := Saturate16(b[159:144] + b[143:128]) +dst[223:208] := Saturate16(b[191:176] + b[175:160]) +dst[239:224] := Saturate16(b[223:208] + b[207:192]) +dst[255:240] := Saturate16(b[255:240] + b[239:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := a[79:64] - a[95:80] +dst[63:48] := a[111:96] - a[127:112] +dst[79:64] := b[15:0] - b[31:16] +dst[95:80] := b[47:32] - b[63:48] +dst[111:96] := b[79:64] - b[95:80] +dst[127:112] := b[111:96] - b[127:112] +dst[143:128] := a[143:128] - a[159:144] +dst[159:144] := a[175:160] - a[191:176] +dst[175:160] := a[207:192] - a[223:208] +dst[191:176] := a[239:224] - a[255:240] +dst[207:192] := b[143:128] - b[159:144] +dst[223:208] := b[175:160] - b[191:176] +dst[239:224] := b[207:192] - b[223:208] +dst[255:240] := b[239:224] - b[255:240] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] +dst[159:128] := a[159:128] - a[191:160] +dst[191:160] := a[223:192] - a[255:224] +dst[223:192] := b[159:128] - b[191:160] +dst[255:224] := b[223:192] - b[255:224] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(a[79:64] - a[95:80]) +dst[63:48] := Saturate16(a[111:96] - a[127:112]) +dst[79:64] := Saturate16(b[15:0] - b[31:16]) +dst[95:80] := Saturate16(b[47:32] - b[63:48]) +dst[111:96] := Saturate16(b[79:64] - b[95:80]) +dst[127:112] := Saturate16(b[111:96] - b[127:112]) +dst[143:128] := Saturate16(a[143:128] - a[159:144]) +dst[159:144] := Saturate16(a[175:160] - a[191:176]) +dst[175:160] := Saturate16(a[207:192] - a[223:208]) +dst[191:176] := Saturate16(a[239:224] - a[255:240]) +dst[207:192] := Saturate16(b[143:128] - b[159:144]) +dst[223:208] := Saturate16(b[175:160] - b[191:176]) +dst[239:224] := Saturate16(b[207:192] - b[223:208]) +dst[255:240] := Saturate16(b[239:224] - b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 31 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 3 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 31 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 15 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[j] := a[i+7] +ENDFOR + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". + +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) +dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Miscellaneous +
+ + + + + Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] AND b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[255:0] := ((NOT a[255:0]) AND b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] OR b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[255:0] := (a[255:0] XOR b[255:0]) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Logical +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Probability/Statistics +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Compare +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j:= 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j:= 0 to 3 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Convert +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF mask[i+31] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF mask[i+63] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:64] := 0 +dst[MAX:64] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF mask[i+31] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:128] := 0 +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF mask[i+63] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +mask[MAX:256] := 0 +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Load +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*32 + IF mask[i+31] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 3 + i := j*64 + IF mask[i+63] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX2 +
immintrin.h
+ Store +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX2 +
immintrin.h
+ Shift +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 1 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 3 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +tmp.dword[0] := b.dword[ imm8[1:0] ] +tmp.dword[1] := b.dword[ imm8[3:2] ] +tmp.dword[2] := b.dword[ imm8[5:4] ] +tmp.dword[3] := b.dword[ imm8[7:6] ] +FOR j := 0 to 1 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + off := 16*idx[i+3:i] + dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + off := 16*idx[i+2:i] + dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 31 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 15 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 15 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 7 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[4:0] := b[i+3:i] + (j & 0x10) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 16 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512BW + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. +
+FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 31 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+FOR j := 0 to 31 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:32] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 15 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+FOR j := 0 to 15 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 15 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+FOR j := 0 to 15 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:16] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. +
+FOR j := 0 to 7 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI
+ENDFOR
+k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. +
+FOR j := 0 to 7 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
+ENDFOR
+k[MAX:8] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512BW + AVX512VL +
immintrin.h
+ Shift +
+ + + + Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". +
+DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len)
+}
+dst[15:0] := REDUCE_ADD(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_ADD(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". +
+DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len)
+}
+dst[15:0] := REDUCE_ADD(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[15:0] + src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_ADD(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_ADD(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". +
+DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len)
+}
+dst[7:0] := REDUCE_ADD(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_ADD(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". +
+DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len)
+}
+dst[7:0] := REDUCE_ADD(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[7:0] + src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_ADD(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_ADD(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". +
+DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len)
+}
+dst[15:0] := REDUCE_MUL(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 1 + FI +ENDFOR +dst[15:0] := REDUCE_MUL(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MUL(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[15:0] * src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_MUL(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 1 + FI +ENDFOR +dst[15:0] := REDUCE_MUL(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MUL(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 1 + FI +ENDFOR +dst[7:0] := REDUCE_MUL(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MUL(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[7:0] * src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_MUL(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 1 + FI +ENDFOR +dst[7:0] := REDUCE_MUL(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_OR(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_OR(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_OR(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[15:0] OR src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_OR(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_OR(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_OR(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_OR(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_OR(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[7:0] OR src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_OR(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_OR(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_AND(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_AND(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_AND(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[15:0] AND src[31:16] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] + ENDFOR + RETURN REDUCE_AND(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_AND(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_AND(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_AND(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_AND(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[7:0] AND src[15:8] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] + ENDFOR + RETURN REDUCE_AND(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_AND(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(-0x8000) + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(-0x8000) + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(-0x80) + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(-0x80) + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MAX(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0 + FI +ENDFOR +dst[15:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MAX(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MAX(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0 + FI +ENDFOR +dst[7:0] := REDUCE_MAX(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(0x7FFF) + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := Int16(0x7FFF) + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(0x7F) + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := Int8(0x7F) + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 8) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +dst[15:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*16 + src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) + ENDFOR + RETURN REDUCE_MIN(src[16*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[i+15:i] := a[i+15:i] + ELSE + tmp[i+15:i] := 0xFFFF + FI +ENDFOR +dst[15:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 16) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +dst[7:0] := REDUCE_MIN(a, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*8 + src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) + ENDFOR + RETURN REDUCE_MIN(src[8*len-1:0], len) +} +tmp := a +FOR j := 0 to 31 + i := j*8 + IF k[j] + tmp[i+7:i] := a[i+7:i] + ELSE + tmp[i+7:i] := 0xFF + FI +ENDFOR +dst[7:0] := REDUCE_MIN(tmp, 32) + + AVX512BW + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst". + +dst[31:0] := b[31:0] +dst[63:32] := a[31:0] +dst[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst". + +dst[15:0] := b[15:0] +dst[31:16] := a[15:0] +dst[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. + +FOR i := 0 to 3 + tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] + tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] + tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] + tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] +ENDFOR +FOR j := 0 to 7 + i := j*64 + tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) + + tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) + + tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) + + tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) +ENDFOR +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + dst[i+127:i] := tmp[127:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*128 + tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) + tmp_dst[i+127:i] := tmp[127:0] +ENDFOR +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := idx[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + off := 16*idx[i+4:i] + dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + IF k[j] + dst[i+15:i] := a[id+15:id] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i]*16 + dst[i+15:i] := a[id+15:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". + +FOR j := 0 to 63 + i := j*8 + IF a[i+7] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := 0xFF + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 0xFFFF + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". + +FOR j := 0 to 31 + i := j*16 + IF a[i+15] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 63 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 7 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[5:0] := b[i+3:i] + (j & 0x30) + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := a[63:0] +tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +tmp_dst[191:128] := a[191:128] +tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +tmp_dst[319:256] := a[319:256] +tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +tmp_dst[447:384] := a[447:384] +tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] +dst[191:128] := a[191:128] +dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] +dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] +dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] +dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] +dst[319:256] := a[319:256] +dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] +dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] +dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] +dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] +dst[447:384] := a[447:384] +dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] +dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] +dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] +dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +tmp_dst[127:64] := a[127:64] +tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +tmp_dst[255:192] := a[255:192] +tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +tmp_dst[383:320] := a[383:320] +tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +tmp_dst[511:448] := a[511:448] +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] +dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] +dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] +dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] +dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] +dst[255:192] := a[255:192] +dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] +dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] +dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] +dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] +dst[383:320] := a[383:320] +dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] +dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] +dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] +dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] +dst[511:448] := a[511:448] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Miscellaneous +
+ + + + + + Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + + Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 32-bit mask from memory into "k". + +k[31:0] := MEM[mem_addr+31:mem_addr] + + + AVX512BW +
immintrin.h
+ Load +
+ + + + Load 64-bit mask from memory into "k". + +k[63:0] := MEM[mem_addr+63:mem_addr] + + + AVX512BW +
immintrin.h
+ Load +
+ + + + + + Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + + Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Move +
+ + + + + + Store packed 16-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 31 + i := j*16 + IF k[j] + MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + + Store packed 8-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 32-bit mask from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + + Store 64-bit mask from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + AVX512BW +
immintrin.h
+ Store +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := ABS(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ABS(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] + b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] + b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[i+7:i] - b[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[i+15:i] - b[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := Saturate16(a[31:0]) +tmp_dst[31:16] := Saturate16(a[63:32]) +tmp_dst[47:32] := Saturate16(a[95:64]) +tmp_dst[63:48] := Saturate16(a[127:96]) +tmp_dst[79:64] := Saturate16(b[31:0]) +tmp_dst[95:80] := Saturate16(b[63:32]) +tmp_dst[111:96] := Saturate16(b[95:64]) +tmp_dst[127:112] := Saturate16(b[127:96]) +tmp_dst[143:128] := Saturate16(a[159:128]) +tmp_dst[159:144] := Saturate16(a[191:160]) +tmp_dst[175:160] := Saturate16(a[223:192]) +tmp_dst[191:176] := Saturate16(a[255:224]) +tmp_dst[207:192] := Saturate16(b[159:128]) +tmp_dst[223:208] := Saturate16(b[191:160]) +tmp_dst[239:224] := Saturate16(b[223:192]) +tmp_dst[255:240] := Saturate16(b[255:224]) +tmp_dst[271:256] := Saturate16(a[287:256]) +tmp_dst[287:272] := Saturate16(a[319:288]) +tmp_dst[303:288] := Saturate16(a[351:320]) +tmp_dst[319:304] := Saturate16(a[383:352]) +tmp_dst[335:320] := Saturate16(b[287:256]) +tmp_dst[351:336] := Saturate16(b[319:288]) +tmp_dst[367:352] := Saturate16(b[351:320]) +tmp_dst[383:368] := Saturate16(b[383:352]) +tmp_dst[399:384] := Saturate16(a[415:384]) +tmp_dst[415:400] := Saturate16(a[447:416]) +tmp_dst[431:416] := Saturate16(a[479:448]) +tmp_dst[447:432] := Saturate16(a[511:480]) +tmp_dst[463:448] := Saturate16(b[415:384]) +tmp_dst[479:464] := Saturate16(b[447:416]) +tmp_dst[495:480] := Saturate16(b[479:448]) +tmp_dst[511:496] := Saturate16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) +dst[143:128] := Saturate16(a[159:128]) +dst[159:144] := Saturate16(a[191:160]) +dst[175:160] := Saturate16(a[223:192]) +dst[191:176] := Saturate16(a[255:224]) +dst[207:192] := Saturate16(b[159:128]) +dst[223:208] := Saturate16(b[191:160]) +dst[239:224] := Saturate16(b[223:192]) +dst[255:240] := Saturate16(b[255:224]) +dst[271:256] := Saturate16(a[287:256]) +dst[287:272] := Saturate16(a[319:288]) +dst[303:288] := Saturate16(a[351:320]) +dst[319:304] := Saturate16(a[383:352]) +dst[335:320] := Saturate16(b[287:256]) +dst[351:336] := Saturate16(b[319:288]) +dst[367:352] := Saturate16(b[351:320]) +dst[383:368] := Saturate16(b[383:352]) +dst[399:384] := Saturate16(a[415:384]) +dst[415:400] := Saturate16(a[447:416]) +dst[431:416] := Saturate16(a[479:448]) +dst[447:432] := Saturate16(a[511:480]) +dst[463:448] := Saturate16(b[415:384]) +dst[479:464] := Saturate16(b[447:416]) +dst[495:480] := Saturate16(b[479:448]) +dst[511:496] := Saturate16(b[511:480]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) +tmp_dst[335:328] := 
Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := Saturate8(a[15:0]) +tmp_dst[15:8] := Saturate8(a[31:16]) +tmp_dst[23:16] := Saturate8(a[47:32]) +tmp_dst[31:24] := Saturate8(a[63:48]) +tmp_dst[39:32] := Saturate8(a[79:64]) +tmp_dst[47:40] := Saturate8(a[95:80]) +tmp_dst[55:48] := Saturate8(a[111:96]) +tmp_dst[63:56] := Saturate8(a[127:112]) +tmp_dst[71:64] := Saturate8(b[15:0]) +tmp_dst[79:72] := Saturate8(b[31:16]) +tmp_dst[87:80] := Saturate8(b[47:32]) +tmp_dst[95:88] := Saturate8(b[63:48]) +tmp_dst[103:96] := Saturate8(b[79:64]) +tmp_dst[111:104] := Saturate8(b[95:80]) +tmp_dst[119:112] := Saturate8(b[111:96]) +tmp_dst[127:120] := Saturate8(b[127:112]) +tmp_dst[135:128] := Saturate8(a[143:128]) +tmp_dst[143:136] := Saturate8(a[159:144]) +tmp_dst[151:144] := Saturate8(a[175:160]) +tmp_dst[159:152] := Saturate8(a[191:176]) +tmp_dst[167:160] := Saturate8(a[207:192]) +tmp_dst[175:168] := Saturate8(a[223:208]) +tmp_dst[183:176] := Saturate8(a[239:224]) +tmp_dst[191:184] := Saturate8(a[255:240]) +tmp_dst[199:192] := Saturate8(b[143:128]) +tmp_dst[207:200] := Saturate8(b[159:144]) +tmp_dst[215:208] := Saturate8(b[175:160]) +tmp_dst[223:216] := Saturate8(b[191:176]) +tmp_dst[231:224] := Saturate8(b[207:192]) +tmp_dst[239:232] := Saturate8(b[223:208]) +tmp_dst[247:240] := Saturate8(b[239:224]) +tmp_dst[255:248] := Saturate8(b[255:240]) +tmp_dst[263:256] := Saturate8(a[271:256]) +tmp_dst[271:264] := Saturate8(a[287:272]) +tmp_dst[279:272] := Saturate8(a[303:288]) +tmp_dst[287:280] := Saturate8(a[319:304]) +tmp_dst[295:288] := Saturate8(a[335:320]) +tmp_dst[303:296] := Saturate8(a[351:336]) +tmp_dst[311:304] := Saturate8(a[367:352]) +tmp_dst[319:312] := Saturate8(a[383:368]) +tmp_dst[327:320] := Saturate8(b[271:256]) +tmp_dst[335:328] := 
Saturate8(b[287:272]) +tmp_dst[343:336] := Saturate8(b[303:288]) +tmp_dst[351:344] := Saturate8(b[319:304]) +tmp_dst[359:352] := Saturate8(b[335:320]) +tmp_dst[367:360] := Saturate8(b[351:336]) +tmp_dst[375:368] := Saturate8(b[367:352]) +tmp_dst[383:376] := Saturate8(b[383:368]) +tmp_dst[391:384] := Saturate8(a[399:384]) +tmp_dst[399:392] := Saturate8(a[415:400]) +tmp_dst[407:400] := Saturate8(a[431:416]) +tmp_dst[415:408] := Saturate8(a[447:432]) +tmp_dst[423:416] := Saturate8(a[463:448]) +tmp_dst[431:424] := Saturate8(a[479:464]) +tmp_dst[439:432] := Saturate8(a[495:480]) +tmp_dst[447:440] := Saturate8(a[511:496]) +tmp_dst[455:448] := Saturate8(b[399:384]) +tmp_dst[463:456] := Saturate8(b[415:400]) +tmp_dst[471:464] := Saturate8(b[431:416]) +tmp_dst[479:472] := Saturate8(b[447:432]) +tmp_dst[487:480] := Saturate8(b[463:448]) +tmp_dst[495:488] := Saturate8(b[479:464]) +tmp_dst[503:496] := Saturate8(b[495:480]) +tmp_dst[511:504] := Saturate8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) +dst[135:128] := Saturate8(a[143:128]) +dst[143:136] := Saturate8(a[159:144]) +dst[151:144] := Saturate8(a[175:160]) +dst[159:152] := Saturate8(a[191:176]) +dst[167:160] := Saturate8(a[207:192]) +dst[175:168] := Saturate8(a[223:208]) +dst[183:176] := Saturate8(a[239:224]) +dst[191:184] := Saturate8(a[255:240]) +dst[199:192] := Saturate8(b[143:128]) +dst[207:200] := Saturate8(b[159:144]) +dst[215:208] := Saturate8(b[175:160]) +dst[223:216] := Saturate8(b[191:176]) +dst[231:224] := Saturate8(b[207:192]) +dst[239:232] := Saturate8(b[223:208]) +dst[247:240] := Saturate8(b[239:224]) +dst[255:248] := Saturate8(b[255:240]) +dst[263:256] := Saturate8(a[271:256]) +dst[271:264] := Saturate8(a[287:272]) +dst[279:272] := Saturate8(a[303:288]) +dst[287:280] := Saturate8(a[319:304]) +dst[295:288] := Saturate8(a[335:320]) +dst[303:296] := Saturate8(a[351:336]) +dst[311:304] := Saturate8(a[367:352]) +dst[319:312] := Saturate8(a[383:368]) +dst[327:320] := Saturate8(b[271:256]) +dst[335:328] := Saturate8(b[287:272]) +dst[343:336] := Saturate8(b[303:288]) +dst[351:344] := Saturate8(b[319:304]) +dst[359:352] := Saturate8(b[335:320]) +dst[367:360] := Saturate8(b[351:336]) +dst[375:368] := Saturate8(b[367:352]) +dst[383:376] := Saturate8(b[383:368]) +dst[391:384] := 
Saturate8(a[399:384]) +dst[399:392] := Saturate8(a[415:400]) +dst[407:400] := Saturate8(a[431:416]) +dst[415:408] := Saturate8(a[447:432]) +dst[423:416] := Saturate8(a[463:448]) +dst[431:424] := Saturate8(a[479:464]) +dst[439:432] := Saturate8(a[495:480]) +dst[447:440] := Saturate8(a[511:496]) +dst[455:448] := Saturate8(b[399:384]) +dst[463:456] := Saturate8(b[415:400]) +dst[471:464] := Saturate8(b[431:416]) +dst[479:472] := Saturate8(b[447:432]) +dst[487:480] := Saturate8(b[463:448]) +dst[495:488] := Saturate8(b[479:464]) +dst[503:496] := Saturate8(b[495:480]) +dst[511:504] := Saturate8(b[511:496]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[15:0] := SaturateU16(a[31:0]) +tmp_dst[31:16] := SaturateU16(a[63:32]) +tmp_dst[47:32] := SaturateU16(a[95:64]) +tmp_dst[63:48] := SaturateU16(a[127:96]) +tmp_dst[79:64] := SaturateU16(b[31:0]) +tmp_dst[95:80] := SaturateU16(b[63:32]) +tmp_dst[111:96] := SaturateU16(b[95:64]) +tmp_dst[127:112] := SaturateU16(b[127:96]) +tmp_dst[143:128] := SaturateU16(a[159:128]) +tmp_dst[159:144] := SaturateU16(a[191:160]) +tmp_dst[175:160] := SaturateU16(a[223:192]) +tmp_dst[191:176] := SaturateU16(a[255:224]) +tmp_dst[207:192] := SaturateU16(b[159:128]) +tmp_dst[223:208] := SaturateU16(b[191:160]) +tmp_dst[239:224] := SaturateU16(b[223:192]) +tmp_dst[255:240] := SaturateU16(b[255:224]) +tmp_dst[271:256] := SaturateU16(a[287:256]) +tmp_dst[287:272] := SaturateU16(a[319:288]) +tmp_dst[303:288] := SaturateU16(a[351:320]) +tmp_dst[319:304] := SaturateU16(a[383:352]) +tmp_dst[335:320] := SaturateU16(b[287:256]) +tmp_dst[351:336] := SaturateU16(b[319:288]) +tmp_dst[367:352] := SaturateU16(b[351:320]) +tmp_dst[383:368] := SaturateU16(b[383:352]) +tmp_dst[399:384] := SaturateU16(a[415:384]) +tmp_dst[415:400] := SaturateU16(a[447:416]) +tmp_dst[431:416] := SaturateU16(a[479:448]) +tmp_dst[447:432] := SaturateU16(a[511:480]) +tmp_dst[463:448] := SaturateU16(b[415:384]) +tmp_dst[479:464] := SaturateU16(b[447:416]) +tmp_dst[495:480] := SaturateU16(b[479:448]) +tmp_dst[511:496] := SaturateU16(b[511:480]) +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := tmp_dst[i+15:i] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) +dst[143:128] := SaturateU16(a[159:128]) +dst[159:144] := SaturateU16(a[191:160]) +dst[175:160] := SaturateU16(a[223:192]) +dst[191:176] := SaturateU16(a[255:224]) +dst[207:192] := SaturateU16(b[159:128]) +dst[223:208] := SaturateU16(b[191:160]) +dst[239:224] := SaturateU16(b[223:192]) +dst[255:240] := SaturateU16(b[255:224]) +dst[271:256] := SaturateU16(a[287:256]) +dst[287:272] := SaturateU16(a[319:288]) +dst[303:288] := SaturateU16(a[351:320]) +dst[319:304] := SaturateU16(a[383:352]) +dst[335:320] := SaturateU16(b[287:256]) +dst[351:336] := SaturateU16(b[319:288]) +dst[367:352] := SaturateU16(b[351:320]) +dst[383:368] := SaturateU16(b[383:352]) +dst[399:384] := SaturateU16(a[415:384]) +dst[415:400] := SaturateU16(a[447:416]) +dst[431:416] := SaturateU16(a[479:448]) +dst[447:432] := SaturateU16(a[511:480]) +dst[463:448] := SaturateU16(b[415:384]) +dst[479:464] := SaturateU16(b[447:416]) +dst[495:480] := SaturateU16(b[479:448]) +dst[511:496] := SaturateU16(b[511:480]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) +tmp_dst[327:320] := 
SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[7:0] := SaturateU8(a[15:0]) +tmp_dst[15:8] := SaturateU8(a[31:16]) +tmp_dst[23:16] := SaturateU8(a[47:32]) +tmp_dst[31:24] := SaturateU8(a[63:48]) +tmp_dst[39:32] := SaturateU8(a[79:64]) +tmp_dst[47:40] := SaturateU8(a[95:80]) +tmp_dst[55:48] := SaturateU8(a[111:96]) +tmp_dst[63:56] := SaturateU8(a[127:112]) +tmp_dst[71:64] := SaturateU8(b[15:0]) +tmp_dst[79:72] := SaturateU8(b[31:16]) +tmp_dst[87:80] := SaturateU8(b[47:32]) +tmp_dst[95:88] := SaturateU8(b[63:48]) +tmp_dst[103:96] := SaturateU8(b[79:64]) +tmp_dst[111:104] := SaturateU8(b[95:80]) +tmp_dst[119:112] := SaturateU8(b[111:96]) +tmp_dst[127:120] := SaturateU8(b[127:112]) +tmp_dst[135:128] := SaturateU8(a[143:128]) +tmp_dst[143:136] := SaturateU8(a[159:144]) +tmp_dst[151:144] := SaturateU8(a[175:160]) +tmp_dst[159:152] := SaturateU8(a[191:176]) +tmp_dst[167:160] := SaturateU8(a[207:192]) +tmp_dst[175:168] := SaturateU8(a[223:208]) +tmp_dst[183:176] := SaturateU8(a[239:224]) +tmp_dst[191:184] := SaturateU8(a[255:240]) +tmp_dst[199:192] := SaturateU8(b[143:128]) +tmp_dst[207:200] := SaturateU8(b[159:144]) +tmp_dst[215:208] := SaturateU8(b[175:160]) +tmp_dst[223:216] := SaturateU8(b[191:176]) +tmp_dst[231:224] := SaturateU8(b[207:192]) +tmp_dst[239:232] := SaturateU8(b[223:208]) +tmp_dst[247:240] := SaturateU8(b[239:224]) +tmp_dst[255:248] := SaturateU8(b[255:240]) +tmp_dst[263:256] := SaturateU8(a[271:256]) +tmp_dst[271:264] := SaturateU8(a[287:272]) +tmp_dst[279:272] := SaturateU8(a[303:288]) +tmp_dst[287:280] := SaturateU8(a[319:304]) +tmp_dst[295:288] := SaturateU8(a[335:320]) +tmp_dst[303:296] := SaturateU8(a[351:336]) +tmp_dst[311:304] := SaturateU8(a[367:352]) +tmp_dst[319:312] := SaturateU8(a[383:368]) +tmp_dst[327:320] := 
SaturateU8(b[271:256]) +tmp_dst[335:328] := SaturateU8(b[287:272]) +tmp_dst[343:336] := SaturateU8(b[303:288]) +tmp_dst[351:344] := SaturateU8(b[319:304]) +tmp_dst[359:352] := SaturateU8(b[335:320]) +tmp_dst[367:360] := SaturateU8(b[351:336]) +tmp_dst[375:368] := SaturateU8(b[367:352]) +tmp_dst[383:376] := SaturateU8(b[383:368]) +tmp_dst[391:384] := SaturateU8(a[399:384]) +tmp_dst[399:392] := SaturateU8(a[415:400]) +tmp_dst[407:400] := SaturateU8(a[431:416]) +tmp_dst[415:408] := SaturateU8(a[447:432]) +tmp_dst[423:416] := SaturateU8(a[463:448]) +tmp_dst[431:424] := SaturateU8(a[479:464]) +tmp_dst[439:432] := SaturateU8(a[495:480]) +tmp_dst[447:440] := SaturateU8(a[511:496]) +tmp_dst[455:448] := SaturateU8(b[399:384]) +tmp_dst[463:456] := SaturateU8(b[415:400]) +tmp_dst[471:464] := SaturateU8(b[431:416]) +tmp_dst[479:472] := SaturateU8(b[447:432]) +tmp_dst[487:480] := SaturateU8(b[463:448]) +tmp_dst[495:488] := SaturateU8(b[479:464]) +tmp_dst[503:496] := SaturateU8(b[495:480]) +tmp_dst[511:504] := SaturateU8(b[511:496]) +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := tmp_dst[i+7:i] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Miscellaneous + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) +dst[135:128] := SaturateU8(a[143:128]) +dst[143:136] := SaturateU8(a[159:144]) +dst[151:144] := SaturateU8(a[175:160]) +dst[159:152] := SaturateU8(a[191:176]) +dst[167:160] := SaturateU8(a[207:192]) +dst[175:168] := SaturateU8(a[223:208]) +dst[183:176] := SaturateU8(a[239:224]) +dst[191:184] := SaturateU8(a[255:240]) +dst[199:192] := SaturateU8(b[143:128]) +dst[207:200] := SaturateU8(b[159:144]) +dst[215:208] := SaturateU8(b[175:160]) +dst[223:216] := SaturateU8(b[191:176]) +dst[231:224] := SaturateU8(b[207:192]) +dst[239:232] := SaturateU8(b[223:208]) +dst[247:240] := SaturateU8(b[239:224]) +dst[255:248] := SaturateU8(b[255:240]) +dst[263:256] := SaturateU8(a[271:256]) +dst[271:264] := SaturateU8(a[287:272]) +dst[279:272] := SaturateU8(a[303:288]) +dst[287:280] := SaturateU8(a[319:304]) +dst[295:288] := SaturateU8(a[335:320]) +dst[303:296] := SaturateU8(a[351:336]) +dst[311:304] := SaturateU8(a[367:352]) +dst[319:312] := SaturateU8(a[383:368]) +dst[327:320] := SaturateU8(b[271:256]) +dst[335:328] := SaturateU8(b[287:272]) +dst[343:336] := SaturateU8(b[303:288]) +dst[351:344] := SaturateU8(b[319:304]) +dst[359:352] := SaturateU8(b[335:320]) +dst[367:360] := SaturateU8(b[351:336]) +dst[375:368] := SaturateU8(b[367:352]) 
+dst[383:376] := SaturateU8(b[383:368]) +dst[391:384] := SaturateU8(a[399:384]) +dst[399:392] := SaturateU8(a[415:400]) +dst[407:400] := SaturateU8(a[431:416]) +dst[415:408] := SaturateU8(a[447:432]) +dst[423:416] := SaturateU8(a[463:448]) +dst[431:424] := SaturateU8(a[479:464]) +dst[439:432] := SaturateU8(a[495:480]) +dst[447:440] := SaturateU8(a[511:496]) +dst[455:448] := SaturateU8(b[399:384]) +dst[463:456] := SaturateU8(b[415:400]) +dst[471:464] := SaturateU8(b[431:416]) +dst[479:472] := SaturateU8(b[447:432]) +dst[487:480] := SaturateU8(b[463:448]) +dst[495:488] := SaturateU8(b[479:464]) +dst[503:496] := SaturateU8(b[495:480]) +dst[511:504] := SaturateU8(b[511:496]) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Saturate8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := SignExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := SaturateU8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + dst[l+7:l] := Truncate8(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) + FI +ENDFOR + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := 16*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+15:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + l := j*16 + IF k[j] + dst[l+15:l] := ZeroExtend16(a[i+7:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Convert +
+ + + + + + Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[7:0] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + + Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[15:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Set +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 63 + i := j*8 + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 31 + i := j*16 + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 63 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 31 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 63 + i := j*8 + IF k1[j] + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 63 + i := j*8 + k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 31 + i := j*16 + IF k1[j] + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 31 + i := j*16 + k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Compare +
+ + + + + Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) +dst[255:128] := a[255:128] << (tmp*8) +dst[383:256] := a[383:256] << (tmp*8) +dst[511:384] := a[511:384] << (tmp*8) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) +dst[255:128] := a[255:128] >> (tmp*8) +dst[383:256] := a[383:256] >> (tmp*8) +dst[511:384] := a[511:384] >> (tmp*8) +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[i+15:i] < 16 + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512BW +
immintrin.h
+ Shift +
+ + + + + Add 32-bit masks in "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] + b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Add 64-bit masks in "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] + b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] AND b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] AND b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 32-bit masks "a" and then AND with "b", and store the result in "k". + +k[31:0] := (NOT a[31:0]) AND b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 64-bit masks "a" and then AND with "b", and store the result in "k". + +k[63:0] := (NOT a[63:0]) AND b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 32-bit mask "a", and store the result in "k". + +k[31:0] := NOT a[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 64-bit mask "a", and store the result in "k". + +k[63:0] := NOT a[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] OR b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] OR b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := NOT (a[31:0] XOR b[31:0]) +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := NOT (a[63:0] XOR b[63:0]) +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 32-bit masks "a" and "b", and store the result in "k". + +k[31:0] := a[31:0] XOR b[31:0] +k[MAX:32] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 64-bit masks "a" and "b", and store the result in "k". + +k[63:0] := a[63:0] XOR b[63:0] +k[MAX:64] := 0 + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 32-bit mask "a" left by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 31 + k[31:0] := a[31:0] << count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 64-bit mask "a" left by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 63 + k[63:0] := a[63:0] << count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 32-bit mask "a" right by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 31 + k[31:0] := a[31:0] >> count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 64-bit mask "a" right by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 63 + k[63:0] := a[63:0] >> count[7:0] +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[31:0] == 0xFFFFFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] OR b[31:0] +IF tmp[31:0] == 0xFFFFFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[7:0] == 0xFFFFFFFFFFFFFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] OR b[63:0] +IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[31:0] := a[31:0] AND b[31:0] +IF tmp1[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[31:0] := (NOT a[31:0]) AND b[31:0] +IF tmp2[31:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := a[31:0] AND b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 32-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[31:0] := (NOT a[31:0]) AND b[31:0] +IF tmp[31:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[63:0] := a[63:0] AND b[63:0] +IF tmp1[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[63:0] := (NOT a[63:0]) AND b[63:0] +IF tmp2[63:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := a[63:0] AND b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 64-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[63:0] := (NOT a[63:0]) AND b[63:0] +IF tmp[63:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert 32-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[31:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert 64-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend64(a[63:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into an 32-bit mask, and store the result in "k". + +k := ZeroExtend32(a[31:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into an 64-bit mask, and store the result in "k". + +k := ZeroExtend64(a[63:0]) + + + AVX512BW +
immintrin.h
+ Mask +
+ + + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Compare +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512CD + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ZeroExtend64(k[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ZeroExtend32(k[15:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Swizzle +
+ + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + FOR k := 0 to j-1 + m := k*32 + dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR l := 0 to j-1 + m := l*32 + dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 + ENDFOR + dst[i+31:i+j] := 0 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + FOR k := 0 to j-1 + m := k*64 + dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + + Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR l := 0 to j-1 + m := l*64 + dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 + ENDFOR + dst[i+63:i+j] := 0 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Compare +
+ + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp := 31 + dst[i+31:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+31:i] := dst[i+31:i] + 1 + OD + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp := 63 + dst[i+63:i] := 0 + DO WHILE (tmp >= 0 AND a[i+tmp] == 0) + tmp := tmp - 1 + dst[i+63:i] := dst[i+63:i] + 1 + OD + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512CD +
immintrin.h
+ Bit Manipulation +
+ + + + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 1 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE imm8[0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 7 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 3 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 3 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 1 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Logical +
+ + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 2)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 8)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 2)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 7 + i := j*64 + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR j := 0 to 15 + i := j*32 + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[1:0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[0] OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE imm8[1:0] OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". + +FOR j := 0 to 15 + i := j*32 + IF a[i+31] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 0xFFFFFFFF + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 0xFFFFFFFFFFFFFFFF + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". + +FOR j := 0 to 7 + i := j*64 + IF a[i+63] + k[j] := 1 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] + 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] + 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] + 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) + 1: dst[63:0] := tmp[63:0] + 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) + 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) + ESAC + + RETURN dst +} +dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +IF k[0] + dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. + imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] + +DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { + CASE opCtl[1:0] OF + 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] + 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] + 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] + 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] + ESAC + + CASE signSelCtl[1:0] OF + 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) + 1: dst[31:0] := tmp[31:0] + 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) + 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) + ESAC + + RETURN dst +} +dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + tmp[63:0] := src1[63:0] - tmp[63:0] + IF IsInf(tmp[63:0]) + tmp[63:0] := FP64(0.0) + FI + RETURN tmp[63:0] +} +dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + tmp[31:0] := src1[31:0] - tmp[31:0] + IF IsInf(tmp[31:0]) + tmp[31:0] := FP32(0.0) + FI + RETURN tmp[31:0] +} +dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512DQ +
immintrin.h
+ Miscellaneous +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512DQ +
immintrin.h
+ Convert +
+ + + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := a[i+63:i] * b[i+63:i] + dst[i+63:i] := tmp[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512DQ +
immintrin.h
+ Arithmetic +
+ + + + + Add 8-bit masks in "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] + b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Add 16-bit masks in "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] + b[15:0] +k[MAX:16] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 8-bit masks "a" and then AND with "b", and store the result in "k". + +k[7:0] := (NOT a[7:0]) AND b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 8-bit mask "a", and store the result in "k". + +k[7:0] := NOT a[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] OR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := NOT (a[7:0] XOR b[7:0]) +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k". + +k[7:0] := a[7:0] XOR b[7:0] +k[MAX:8] := 0 + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] << count[7:0] +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 7 + k[7:0] := a[7:0] >> count[7:0] +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[7:0] == 0xFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] OR b[7:0] +IF tmp[7:0] == 0xFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[7:0] := a[7:0] AND b[7:0] +IF tmp1[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp2[7:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := a[7:0] AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[7:0] := (NOT a[7:0]) AND b[7:0] +IF tmp[7:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". + +tmp1[15:0] := a[15:0] AND b[15:0] +IF tmp1[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +tmp2[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp2[15:0] == 0x0 + MEM[and_not+7:and_not] := 1 +ELSE + MEM[and_not+7:and_not] := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := (NOT a[15:0]) AND b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Convert 8-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[7:0]) + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into an 8-bit mask, and store the result in "k". + +k := a[7:0] + + + AVX512DQ +
immintrin.h
+ Mask +
+ + + + Load 8-bit mask from memory into "k". + +k[7:0] := MEM[mem_addr+7:mem_addr] + + + AVX512DQ +
immintrin.h
+ Load +
+ + + + + Store 8-bit mask from "a" into memory. + +MEM[mem_addr+7:mem_addr] := a[7:0] + + + AVX512DQ +
immintrin.h
+ Store +
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ACOS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ACOS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ACOSH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ACOSH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ASIN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ASIN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ASINH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ASINH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATAN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATAN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ATANH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ATANH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COSD(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COSD(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := COSH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := COSH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SINH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SINH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TAN(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TAN(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TAND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TAND(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TANH(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TANH(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) + ELSE + dst[i+63:i] := sin_src[i+63:i] + MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) + ELSE + dst[i+31:i] := sin_src[i+31:i] + MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CubeRoot(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CubeRoot(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(10.0, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(2.0, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(e, a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(e), a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := InvSQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := InvSQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(1.0 + a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(1.0 + a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LOG(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CDFNormal(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CDFNormal(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Probability/Statistics +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := CEIL(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := CEIL(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FLOOR(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FLOOR(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := NearbyInt(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := NearbyInt(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := NearbyInt(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := NearbyInt(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundToNearestEven(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundToNearestEven(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ROUND(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := TRUNCATE(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := TRUNCATE(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 63 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 31 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 63 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 31 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). RM. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] :=0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (32*imm8[2:0]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (32*imm8[1:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst". + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +dst[255:0] := temp[255:0] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[511:256] := a[255:0] +temp[255:0] := b[255:0] +temp[511:0] := temp[511:0] >> (64*imm8[1:0]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst". + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +dst[127:0] := temp[127:0] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[255:128] := a[127:0] +temp[127:0] := b[127:0] +temp[255:0] := temp[255:0] >> (64*imm8[0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +ESAC +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN: j := 0 + SNAN_TOKEN: j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[255:0] := a[255:0] +CASE (imm8[0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +ESAC +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[255:0] := a[255:0] +CASE (imm8[0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +ESAC +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + off := idx[i+2:i]*32 + dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + off := idx[i+1:i]*32 + dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + off := idx[i+1:i]*64 + dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + IF k[j] + dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + off := idx[i]*64 + dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:128] := 0 + + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + id := idx[i+2:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + id := idx[i+1:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst.m128[0] := a.m128[imm8[0]] +tmp_dst.m128[1] := b.m128[imm8[1]] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +dst.m128[0] := a.m128[imm8[0]] +dst.m128[1] := b.m128[imm8[1]] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 3 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 1 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 3 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is non-zero. + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 1 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 3 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:4] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k") if the intermediate value is zero. + +FOR j := 0 to 1 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 1 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:2] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 7 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 3 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 3 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 1 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512F + AVX512VL +
immintrin.h
+ Store +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:16] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 3 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 1 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Load +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Move +
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 3 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 1 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Logical +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Set +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + a[i+127:i] := MixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 3 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + a[i+127:i] := InvMixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F + VAES +
immintrin.h
+ Cryptography +
+ + + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] + b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] + b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] + b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] + b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 64*j + IF k[j] + dst[i+63:i] := a[i+63:i] / b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := a[i+31:i] / b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] / b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] / b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] / b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] / b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := c[63:0] +FI +dst[127:64] := c[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := c[31:0] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] * b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] * b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := a[63:0] - b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := a[31:0] - b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := a[31:0] - b[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 16-bit mask from "a" into memory. + +MEM[mem_addr+15:mem_addr] := a[15:0] + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + MEM[mem_addr+63:mem_addr] := a[63:0] +FI + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + MEM[mem_addr+31:mem_addr] := a[31:0] +FI + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 32 +m := base_addr +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 64 +m := base_addr +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Store +
+ + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 16-bit mask from memory into "k". + +k[15:0] := MEM[mem_addr+15:mem_addr] + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] +ELSE + dst[63:0] := src[63:0] +FI +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[63:0] := MEM[mem_addr+63:mem_addr] +ELSE + dst[63:0] := 0 +FI +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] +ELSE + dst[31:0] := src[31:0] +FI +dst[MAX:32] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +IF k[0] + dst[31:0] := MEM[mem_addr+31:mem_addr] +ELSE + dst[31:0] := 0 +FI +dst[MAX:32] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 7 + i := j*64 + m := j*64 + IF k[j] + addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] << count[7:0] +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". + +k[MAX:0] := 0 +IF count[7:0] <= 15 + k[15:0] := a[15:0] >> count[7:0] +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI +IF tmp[15:0] == 0xFFFF + MEM[all_ones+7:all_ones] := 1 +ELSE + MEM[all_ones+7:all_ones] := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0x0 + dst := 1 +ELSE + dst := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". + +tmp[15:0] := a[15:0] OR b[15:0] +IF tmp[15:0] == 0xFFFF + dst := 1 +ELSE + dst := 0 +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Convert 16-bit mask "a" into an integer value, and store the result in "dst". + +dst := ZeroExtend32(a[15:0]) + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Convert integer value "a" into a 16-bit mask, and store the result in "k". + +k := ZeroExtend16(a[15:0]) + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". + +k[15:0] := (NOT a[15:0]) AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] AND b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Copy 16-bit mask "a" to "k". + +k[15:0] := a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". + +k[15:0] := NOT a[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] OR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k". + +k[7:0] := b[7:0] +k[15:8] := a[7:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := NOT (a[15:0] XOR b[15:0]) +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". + +k[15:0] := a[15:0] XOR b[15:0] +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. + dst[15:0] := k1[15:0] | k2[15:0] +IF dst == 0 + SetZF() +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. + dst[15:0] := k1[15:0] | k2[15:0] +IF PopCount(dst[15:0]) == 16 + SetCF() +FI + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Converts bit mask "k1" into an integer value, storing the results in "dst". + +dst := ZeroExtend32(k1) + + + AVX512F +
immintrin.h
+ Mask +
+ + + + Converts integer "mask" into bitmask, storing the result in "dst". + +dst := mask[15:0] + + + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +dst[511:0] := temp[511:0] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := temp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? -INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := a[63:0] +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { + tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] + CASE(tsrc[63:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[63:0] := src1[63:0] + 1 : dest[63:0] := tsrc[63:0] + 2 : dest[63:0] := QNaN(tsrc[63:0]) + 3 : dest[63:0] := QNAN_Indefinite + 4 : dest[63:0] := -INF + 5 : dest[63:0] := +INF + 6 : dest[63:0] := tsrc.sign? 
-INF : +INF + 7 : dest[63:0] := -0 + 8 : dest[63:0] := +0 + 9 : dest[63:0] := -1 + 10: dest[63:0] := +1 + 11: dest[63:0] := 1/2 + 12: dest[63:0] := 90.0 + 13: dest[63:0] := PI/2 + 14: dest[63:0] := MAX_FLOAT + 15: dest[63:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[63:0] +} +IF k[0] + dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := b[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? -INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := a[31:0] +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + [sae_note] + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. + enum TOKEN_TYPE { + QNAN_TOKEN := 0, \ + SNAN_TOKEN := 1, \ + ZERO_VALUE_TOKEN := 2, \ + ONE_VALUE_TOKEN := 3, \ + NEG_INF_TOKEN := 4, \ + POS_INF_TOKEN := 5, \ + NEG_VALUE_TOKEN := 6, \ + POS_VALUE_TOKEN := 7 +} +DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { + tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] + CASE(tsrc[31:0]) OF + QNAN_TOKEN:j := 0 + SNAN_TOKEN:j := 1 + ZERO_VALUE_TOKEN: j := 2 + ONE_VALUE_TOKEN: j := 3 + NEG_INF_TOKEN: j := 4 + POS_INF_TOKEN: j := 5 + NEG_VALUE_TOKEN: j := 6 + POS_VALUE_TOKEN: j := 7 + ESAC + + token_response[3:0] := src3[3+4*j:4*j] + + CASE(token_response[3:0]) OF + 0 : dest[31:0] := src1[31:0] + 1 : dest[31:0] := tsrc[31:0] + 2 : dest[31:0] := QNaN(tsrc[31:0]) + 3 : dest[31:0] := QNAN_Indefinite + 4 : dest[31:0] := -INF + 5 : dest[31:0] := +INF + 6 : dest[31:0] := tsrc.sign? 
-INF : +INF + 7 : dest[31:0] := -0 + 8 : dest[31:0] := +0 + 9 : dest[31:0] := -1 + 10: dest[31:0] := +1 + 11: dest[31:0] := 1/2 + 12: dest[31:0] := 90.0 + 13: dest[31:0] := PI/2 + 14: dest[31:0] := MAX_FLOAT + 15: dest[31:0] := -MAX_FLOAT + ESAC + + CASE(tsrc[31:0]) OF + ZERO_VALUE_TOKEN: + IF (imm8[0]) #ZE; FI + ZERO_VALUE_TOKEN: + IF (imm8[1]) #IE; FI + ONE_VALUE_TOKEN: + IF (imm8[2]) #ZE; FI + ONE_VALUE_TOKEN: + IF (imm8[3]) #IE; FI + SNAN_TOKEN: + IF (imm8[4]) #IE; FI + NEG_INF_TOKEN: + IF (imm8[5]) #IE; FI + NEG_VALUE_TOKEN: + IF (imm8[6]) #IE; FI + POS_INF_TOKEN: + IF (imm8[7]) #IE; FI + ESAC + RETURN dest[31:0] +} +IF k[0] + dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := b[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[63:0] := ConvertExpFP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[63:0] := ConvertExpFP64(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst[31:0] := ConvertExpFP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + [sae_note] + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst[31:0] := ConvertExpFP32(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + IF k[0] + dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +IF k[0] + dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] + +DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { + m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) + IF IsInf(tmp[63:0]) + tmp[63:0] := src1[63:0] + FI + RETURN tmp[63:0] +} +dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +IF k[0] + dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { + m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) + IF IsInf(tmp[31:0]) + tmp[31:0] := src1[31:0] + FI + RETURN tmp[31:0] +} +dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +IF k[0] + dst[63:0] := SCALE(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the lower double-precision (64-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) + RETURN dst[63:0] +} +dst[63:0] := SCALE(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +IF k[0] + dst[31:0] := SCALE(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the lower single-precision (32-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + DEFINE SCALE(src1, src2) { + IF (src2 == NaN) + IF (src2 == SNaN) + RETURN QNAN(src2) + FI + ELSE IF (src1 == NaN) + IF (src1 == SNaN) + RETURN QNAN(src1) + FI + IF (src2 != INF) + RETURN QNAN(src1) + FI + ELSE + tmp_src2 := src2 + tmp_src1 := src1 + IF (IS_DENORMAL(src2) AND MXCSR.DAZ) + tmp_src2 := 0 + FI + IF (IS_DENORMAL(src1) AND MXCSR.DAZ) + tmp_src1 := 0 + FI + FI + dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) + RETURN dst[31:0] +} +dst[31:0] := SCALE(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + dst[i+31:i] := a[n+31:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + n := (j % 4)*32 + IF k[j] + dst[i+31:i] := a[n+31:n] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + dst[i+63:i] := a[n+63:n] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + n := (j % 4)*64 + IF k[j] + dst[i+63:i] := a[n+63:n] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[1:0] OF +0: dst[127:0] := a[127:0] +1: dst[127:0] := a[255:128] +2: dst[127:0] := a[383:256] +3: dst[127:0] := a[511:384] +ESAC +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[1:0] OF +0: tmp[127:0] := a[127:0] +1: tmp[127:0] := a[255:128] +2: tmp[127:0] := a[383:256] +3: tmp[127:0] := a[511:384] +ESAC +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". + +CASE imm8[0] OF +0: dst[255:0] := a[255:0] +1: dst[255:0] := a[511:256] +ESAC +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +CASE imm8[0] OF +0: tmp[255:0] := a[255:0] +1: tmp[255:0] := a[511:256] +ESAC +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: dst[127:0] := b[127:0] +1: dst[255:128] := b[127:0] +2: dst[383:256] := b[127:0] +3: dst[511:384] := b[127:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[1:0]) OF +0: tmp[127:0] := b[127:0] +1: tmp[255:128] := b[127:0] +2: tmp[383:256] := b[127:0] +3: tmp[511:384] := b[127:0] +ESAC +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". + +dst[511:0] := a[511:0] +CASE (imm8[0]) OF +0: dst[255:0] := b[255:0] +1: dst[511:256] := b[255:0] +ESAC +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[511:0] := a[511:0] +CASE (imm8[0]) OF +0: tmp[255:0] := b[255:0] +1: tmp[511:256] := b[255:0] +ESAC +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 32 +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[m+size-1:m] := a[i+31:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 64 +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[m+size-1:m] := a[i+63:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := idx[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + off := idx[i+3:i]*32 + dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := idx[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + off := idx[i+2:i]*64 + dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI +IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI +IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI +IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI +IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI +IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI +IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI +IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI +IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI +IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI +IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI +IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI +IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI +IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI +IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI +IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI +IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI +IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI +IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI +IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI +IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI +IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI +IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI +IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI +IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI +IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI +IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI +IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI +IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI +IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI +IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +IF (b[1] == 0) dst[63:0] := a[63:0]; FI +IF (b[1] == 1) dst[63:0] := a[127:64]; FI +IF (b[65] == 0) dst[127:64] := a[63:0]; FI +IF (b[65] == 1) dst[127:64] := a[127:64]; FI +IF (b[129] == 0) dst[191:128] := a[191:128]; FI +IF (b[129] == 1) dst[191:128] := a[255:192]; FI +IF (b[193] == 0) dst[255:192] := a[191:128]; FI +IF (b[193] == 1) dst[255:192] := a[255:192]; FI +IF (b[257] == 0) dst[319:256] := a[319:256]; FI +IF (b[257] == 1) dst[319:256] := a[383:320]; FI +IF (b[321] == 0) dst[383:320] := a[319:256]; FI +IF (b[321] == 1) dst[383:320] := a[383:320]; FI +IF (b[385] == 0) dst[447:384] := a[447:384]; FI +IF (b[385] == 1) dst[447:384] := a[511:448]; FI +IF (b[449] == 0) dst[511:448] := a[447:384]; FI +IF (b[449] == 1) dst[511:448] := a[511:448]; FI +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) +tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) +tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) +tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) +tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) +tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) +tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) +tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) +tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) +tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) +tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) +tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) +tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) +tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) +tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], b[1:0]) +dst[63:32] := SELECT4(a[127:0], b[33:32]) +dst[95:64] := SELECT4(a[127:0], b[65:64]) +dst[127:96] := SELECT4(a[127:0], b[97:96]) +dst[159:128] := SELECT4(a[255:128], b[129:128]) +dst[191:160] := SELECT4(a[255:128], b[161:160]) +dst[223:192] := SELECT4(a[255:128], b[193:192]) +dst[255:224] := SELECT4(a[255:128], b[225:224]) +dst[287:256] := SELECT4(a[383:256], b[257:256]) +dst[319:288] := SELECT4(a[383:256], b[289:288]) +dst[351:320] := SELECT4(a[383:256], b[321:320]) +dst[383:352] := SELECT4(a[383:256], b[353:352]) +dst[415:384] := SELECT4(a[511:384], b[385:384]) +dst[447:416] := SELECT4(a[511:384], b[417:416]) +dst[479:448] := SELECT4(a[511:384], b[449:448]) +dst[511:480] := SELECT4(a[511:384], b[481:480]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + IF k[j] + dst[i+63:i] := a[id+63:id] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[63:0] := src[63:0] + 1: tmp[63:0] := src[127:64] + 2: tmp[63:0] := src[191:128] + 3: tmp[63:0] := src[255:192] + ESAC + RETURN tmp[63:0] +} +dst[63:0] := SELECT4(a[255:0], imm8[1:0]) +dst[127:64] := SELECT4(a[255:0], imm8[3:2]) +dst[191:128] := SELECT4(a[255:0], imm8[5:4]) +dst[255:192] := SELECT4(a[255:0], imm8[7:6]) +dst[319:256] := SELECT4(a[511:256], imm8[1:0]) +dst[383:320] := SELECT4(a[511:256], imm8[3:2]) +dst[447:384] := SELECT4(a[511:256], imm8[5:4]) +dst[511:448] := SELECT4(a[511:256], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + id := idx[i+2:i]*64 + dst[i+63:i] := a[id+63:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[m+31:m] + m := m + 32 + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[m+63:m] + m := m + 64 + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[127:0] := src[127:0] + 1: tmp[127:0] := src[255:128] + 2: tmp[127:0] := src[383:256] + 3: tmp[127:0] := src[511:384] + ESAC + RETURN tmp[127:0] +} +dst[127:0] := SELECT4(a[511:0], imm8[1:0]) +dst[255:128] := SELECT4(a[511:0], imm8[3:2]) +dst[383:256] := SELECT4(b[511:0], imm8[5:4]) +dst[511:384] := SELECT4(b[511:0], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] +dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] +dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] +dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] +dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] +dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(b[255:128], imm8[5:4]) +dst[255:224] := SELECT4(b[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(b[383:256], imm8[5:4]) +dst[383:352] := SELECT4(b[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(b[511:384], imm8[5:4]) +dst[511:480] := SELECT4(b[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp_dst[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) +dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) +dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) +dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0 + + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := src[m+63:m] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + m := j*64 + IF k[j] + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) + ELSE + dst[m+63:m] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] + +FOR j := 0 to 15 + i := 16*j + l := 32*j + IF k[j] + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [sae_note] + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + [round_note] + +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + [round_note] + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 32*j + l := 64*j + IF k[j] + dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + [sae_note] + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + l := j*32 + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + IF k[j] + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Truncate8(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Truncate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Truncate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Truncate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Truncate16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Truncate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := Saturate8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := Saturate8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := Saturate32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := Saturate32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := Saturate16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := Saturate16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + l := j*16 + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := SignExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := SignExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+31:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 32-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+31:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[k+7:k] := SaturateU8(a[i+63:i]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := src[l+7:l] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[l+7:l] := SaturateU8(a[i+63:i]) + ELSE + dst[l+7:l] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[k+31:k] := SaturateU32(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[l+31:l] := SaturateU32(a[i+63:i]) + ELSE + dst[l+31:l] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[k+15:k] := SaturateU16(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := src[l+15:l] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + Store + + + + + Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[l+15:l] := SaturateU16(a[i+63:i]) + ELSE + dst[l+15:l] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 8*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+7:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 8*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+7:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 32*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+31:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 15 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := 32*j + l := 16*j + IF k[j] + dst[i+31:i] := ZeroExtend32(a[l+15:l]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := 64*j + l := 16*j + IF k[j] + dst[i+63:i] := ZeroExtend64(a[l+15:l]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MAX(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] + +dst[63:0] := MAX(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MAX(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := MIN(a[63:0], b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] + +dst[63:0] := MIN(a[63:0], b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := MIN(a[31:0], b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[63:0] := a[63:0] +tmp[127:64] := a[63:0] +tmp[191:128] := a[191:128] +tmp[255:192] := a[191:128] +tmp[319:256] := a[319:256] +tmp[383:320] := a[319:256] +tmp[447:384] := a[447:384] +tmp[511:448] := a[447:384] +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := tmp[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] +dst[191:128] := a[191:128] +dst[255:192] := a[191:128] +dst[319:256] := a[319:256] +dst[383:320] := a[319:256] +dst[447:384] := a[447:384] +dst[511:448] := a[447:384] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := b[63:0] +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[63:32] +tmp[63:32] := a[63:32] +tmp[95:64] := a[127:96] +tmp[127:96] := a[127:96] +tmp[159:128] := a[191:160] +tmp[191:160] := a[191:160] +tmp[223:192] := a[255:224] +tmp[255:224] := a[255:224] +tmp[287:256] := a[319:288] +tmp[319:288] := a[319:288] +tmp[351:320] := a[383:352] +tmp[383:352] := a[383:352] +tmp[415:384] := a[447:416] +tmp[447:416] := a[447:416] +tmp[479:448] := a[511:480] +tmp[511:480] := a[511:480] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] +dst[159:128] := a[191:160] +dst[191:160] := a[191:160] +dst[223:192] := a[255:224] +dst[255:224] := a[255:224] +dst[287:256] := a[319:288] +dst[319:288] := a[319:288] +dst[351:320] := a[383:352] +dst[383:352] := a[383:352] +dst[415:384] := a[447:416] +dst[447:416] := a[447:416] +dst[479:448] := a[511:480] +dst[511:480] := a[511:480] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +tmp[31:0] := a[31:0] +tmp[63:32] := a[31:0] +tmp[95:64] := a[95:64] +tmp[127:96] := a[95:64] +tmp[159:128] := a[159:128] +tmp[191:160] := a[159:128] +tmp[223:192] := a[223:192] +tmp[255:224] := a[223:192] +tmp[287:256] := a[287:256] +tmp[319:288] := a[287:256] +tmp[351:320] := a[351:320] +tmp[383:352] := a[351:320] +tmp[415:384] := a[415:384] +tmp[447:416] := a[415:384] +tmp[479:448] := a[479:448] +tmp[511:480] := a[479:448] +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] +dst[159:128] := a[159:128] +dst[191:160] := a[159:128] +dst[223:192] := a[223:192] +dst[255:224] := a[223:192] +dst[287:256] := a[287:256] +dst[319:288] := a[287:256] +dst[351:320] := a[351:320] +dst[383:352] := a[351:320] +dst[415:384] := a[415:384] +dst[447:416] := a[415:384] +dst[479:448] := a[479:448] +dst[511:480] := a[479:448] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := b[31:0] +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + IF k[j] + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 15 + i := j*32 + FOR h := 0 to 31 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + IF k[j] + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". + +DEFINE TernaryOP(imm8, a, b, c) { + CASE imm8[7:0] OF + 0: dst[0] := 0 // imm8[7:0] := 0 + 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) + // ... + 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C + 255: dst[0] := 1 // imm8[7:0] := 1 + ESAC +} +imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) +FOR j := 0 to 7 + i := j*64 + FOR h := 0 to 63 + dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. + +FOR j := 0 to 7 + i := j*64 + k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". + +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[31:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[63:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast the low packed 16-bit integer from "a" to all elements of "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence. + +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence. + +dst[63:0] := a +dst[127:64] := b +dst[191:128] := c +dst[255:192] := d +dst[319:256] := a +dst[383:320] := b +dst[447:384] := c +dst[511:448] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence. + +dst[31:0] := a +dst[63:32] := b +dst[95:64] := c +dst[127:96] := d +dst[159:128] := a +dst[191:160] := b +dst[223:192] := c +dst[255:224] := d +dst[287:256] := a +dst[319:288] := b +dst[351:320] := c +dst[383:352] := d +dst[415:384] := a +dst[447:416] := b +dst[479:448] := c +dst[511:480] := d +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 +dst[135:128] := e16 +dst[143:136] := e17 +dst[151:144] := e18 +dst[159:152] := e19 +dst[167:160] := e20 +dst[175:168] := e21 +dst[183:176] := e22 +dst[191:184] := e23 +dst[199:192] := e24 +dst[207:200] := e25 +dst[215:208] := e26 +dst[223:216] := e27 +dst[231:224] := e28 +dst[239:232] := e29 +dst[247:240] := e30 +dst[255:248] := e31 +dst[263:256] := e32 +dst[271:264] := e33 +dst[279:272] := e34 +dst[287:280] := e35 +dst[295:288] := e36 +dst[303:296] := e37 +dst[311:304] := e38 +dst[319:312] := e39 +dst[327:320] := e40 +dst[335:328] := e41 +dst[343:336] := e42 +dst[351:344] := e43 +dst[359:352] := e44 +dst[367:360] := e45 +dst[375:368] := e46 +dst[383:376] := e47 +dst[391:384] := e48 +dst[399:392] := e49 +dst[407:400] := e50 +dst[415:408] := e51 +dst[423:416] := e52 +dst[431:424] := e53 +dst[439:432] := e54 +dst[447:440] := e55 +dst[455:448] := e56 +dst[463:456] := e57 +dst[471:464] := e58 +dst[479:472] := e59 +dst[487:480] := e60 +dst[495:488] := e61 +dst[503:496] := e62 +dst[511:504] := e63 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 +dst[143:128] := e8 +dst[159:144] := e9 +dst[175:160] := e10 +dst[191:176] := e11 +dst[207:192] := e12 +dst[223:208] := e13 +dst[239:224] := e14 +dst[255:240] := e15 +dst[271:256] := e16 +dst[287:272] := e17 +dst[303:288] := e18 +dst[319:304] := e19 +dst[335:320] := e20 +dst[351:336] := e21 +dst[367:352] := e22 +dst[383:368] := e23 +dst[399:384] := e24 +dst[415:400] := e25 +dst[431:416] := e26 +dst[447:432] := e27 +dst[463:448] := e28 +dst[479:464] := e29 +dst[495:480] := e30 +dst[511:496] := e31 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 +dst[191:128] := e2 +dst[255:192] := e3 +dst[319:256] := e4 +dst[383:320] := e5 +dst[447:384] := e6 +dst[511:448] := e7 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 +dst[159:128] := e4 +dst[191:160] := e5 +dst[223:192] := e6 +dst[255:224] := e7 +dst[287:256] := e8 +dst[319:288] := e9 +dst[351:320] := e10 +dst[383:352] := e11 +dst[415:384] := e12 +dst[447:416] := e13 +dst[479:448] := e14 +dst[511:480] := e15 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order. + +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order. + +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. + +dst[63:0] := d +dst[127:64] := c +dst[191:128] := b +dst[255:192] := a +dst[319:256] := d +dst[383:320] := c +dst[447:384] := b +dst[511:448] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. + +dst[31:0] := d +dst[63:32] := c +dst[95:64] := b +dst[127:96] := a +dst[159:128] := d +dst[191:160] := c +dst[223:192] := b +dst[255:224] := a +dst[287:256] := d +dst[319:288] := c +dst[351:320] := b +dst[383:352] := a +dst[415:384] := d +dst[447:416] := c +dst[479:448] := b +dst[511:480] := a +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e7 +dst[127:64] := e6 +dst[191:128] := e5 +dst[255:192] := e4 +dst[319:256] := e3 +dst[383:320] := e2 +dst[447:384] := e1 +dst[511:448] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e15 +dst[63:32] := e14 +dst[95:64] := e13 +dst[127:96] := e12 +dst[159:128] := e11 +dst[191:160] := e10 +dst[223:192] := e9 +dst[255:224] := e8 +dst[287:256] := e7 +dst[319:288] := e6 +dst[351:320] := e5 +dst[383:352] := e4 +dst[415:384] := e3 +dst[447:416] := e2 +dst[479:448] := e1 +dst[511:480] := e0 +dst[MAX:512] := 0 + + AVX512F +
immintrin.h
+ Set +
+ + + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512d with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512 with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + Return vector of type __m512i with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512F +
immintrin.h
+ Set +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src << count) OR (src >> (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE LEFT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src << count) OR (src >> (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { + count := count_src % 32 + RETURN (src >>count) OR (src << (32 - count)) +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". + +DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { + count := count_src % 64 + RETURN (src >> count) OR (src << (64 - count)) +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) + ELSE + dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF count[i+63:i] < 64 + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[63:0] := (1.0 / b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[31:0] := (1.0 / b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[63:0] := (1.0 / SQRT(b[63:0])) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[63:0] := (1.0 / SQRT(b[63:0])) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +IF k[0] + dst[31:0] := (1.0 / SQRT(b[31:0])) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. + +dst[31:0] := (1.0 / SQRT(b[31:0])) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note]. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := SQRT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := SQRT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note]. + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := src[63:0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + [round_note] + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst[63:0] := SQRT(b[63:0]) +ELSE + dst[63:0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := SQRT(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := src[31:0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst[31:0] := SQRT(b[31:0]) +ELSE + dst[31:0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := SQRT(b[31:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512F +
immintrin.h
+ Elementary Math Functions +
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m128d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m128. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m256d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m256. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m128i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m256i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512i with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512d with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + Return vector of type __m512 with undefined elements. + AVX512F +
immintrin.h
+ General Support +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := c[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := c[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_ADD(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_ADD(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_ADD(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_ADD(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 1 + FI +ENDFOR +dst[31:0] := REDUCE_MUL(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1 + FI +ENDFOR +dst[63:0] := REDUCE_MUL(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 1.0 + FI +ENDFOR +dst[63:0] := REDUCE_MUL(tmp, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := FP32(1.0) + FI +ENDFOR +dst[31:0] := REDUCE_MUL(tmp, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[63:0] + src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_ADD(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_ADD(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +DEFINE REDUCE_ADD(src, len) { + IF len == 2 + RETURN src[31:0] + src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_ADD(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_ADD(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[63:0] * src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_MUL(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MUL(a, 8) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +DEFINE REDUCE_MUL(src, len) { + IF len == 2 + RETURN src[31:0] * src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_MUL(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MUL(a, 16) + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ABS(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ABS(v2[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ABS(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ABS(v2[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst". + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +dst[511:0] := temp[511:0] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +temp[1023:512] := a[511:0] +temp[511:0] := b[511:0] +temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := temp[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + [sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. + [getmant_note][sae_note] + FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + IF k[j] + dst[i+31:i] := a[id+31:id] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. + +FOR j := 0 to 15 + i := j*32 + id := idx[i+3:i]*32 + dst[i+31:i] := a[id+31:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := tmp_dst[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) +dst[159:128] := SELECT4(a[255:128], imm8[1:0]) +dst[191:160] := SELECT4(a[255:128], imm8[3:2]) +dst[223:192] := SELECT4(a[255:128], imm8[5:4]) +dst[255:224] := SELECT4(a[255:128], imm8[7:6]) +dst[287:256] := SELECT4(a[383:256], imm8[1:0]) +dst[319:288] := SELECT4(a[383:256], imm8[3:2]) +dst[351:320] := SELECT4(a[383:256], imm8[5:4]) +dst[383:352] := SELECT4(a[383:256], imm8[7:6]) +dst[415:384] := SELECT4(a[511:384], imm8[1:0]) +dst[447:416] := SELECT4(a[511:384], imm8[3:2]) +dst[479:448] := SELECT4(a[511:384], imm8[5:4]) +dst[511:480] := SELECT4(a[511:384], imm8[7:6]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Swizzle +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + +FOR j := 0 to 7 + i := j*64 + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 7 + i := j*64 + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 7 + i := j*64 + IF k1[j] + k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". + FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". + +FOR j := 0 to 15 + i := j*32 + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[2:0]) OF +0: OP := _MM_CMPINT_EQ +1: OP := _MM_CMPINT_LT +2: OP := _MM_CMPINT_LE +3: OP := _MM_CMPINT_FALSE +4: OP := _MM_CMPINT_NE +5: OP := _MM_CMPINT_NLT +6: OP := _MM_CMPINT_NLE +7: OP := _MM_CMPINT_TRUE +ESAC +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Compare +
+ + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+31:i] := MEM[addr+31:addr] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + + + Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + dst[i+63:i] := MEM[addr+63:addr] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Load +
+ + + + + + Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Move +
+ + + + + + Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 32-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 15 + i := j*32 + IF k[j] + MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + Store packed 64-bit integers from "a" into memory using writemask "k". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +FOR j := 0 to 7 + i := j*64 + IF k[j] + MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. + +FOR j := 0 to 15 + i := j*32 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+31:addr] := a[i+31:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] AND b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". + +dst[511:0] := ((NOT a[511:0]) AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". + +dst[511:0] := (a[511:0] AND b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] AND b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] OR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] OR b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] OR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*32 + IF k1[j] + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. + +FOR j := 0 to 15 + i := j*32 + k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[511:0] := (a[511:0] XOR b[511:0]) +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0xFFFFFFFF + FI +ENDFOR +dst[31:0] := REDUCE_AND(tmp, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI +ENDFOR +dst[63:0] := REDUCE_AND(tmp, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_OR(tmp, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_OR(tmp, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[31:0] AND src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_AND(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_AND(a, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". + +DEFINE REDUCE_AND(src, len) { + IF len == 2 + RETURN src[63:0] AND src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_AND(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_AND(a, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[31:0] OR src[63:32] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] + ENDFOR + RETURN REDUCE_OR(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_OR(a, 16) + + AVX512F +
immintrin.h
+ Logical +
+ + + + Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". + +DEFINE REDUCE_OR(src, len) { + IF len == 2 + RETURN src[63:0] OR src[127:64] + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] + ENDFOR + RETURN REDUCE_OR(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_OR(a, 8) + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := v2[i+31:i] & v3[i+31:i] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Logical +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(-0x80000000) + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(-0x8000000000000000) + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0 + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0 + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0xFFEFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MAX(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0xFF7FFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MAX(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Int32(0x7FFFFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed signed 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Int64(0x7FFFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := 0xFFFFFFFF + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +tmp := a +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[i+63:i] := a[i+63:i] + ELSE + tmp[i+63:i] := Cast_FP64(0x7FEFFFFFFFFFFFFF) + FI +ENDFOR +dst[63:0] := REDUCE_MIN(tmp, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +tmp := a +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[i+31:i] := a[i+31:i] + ELSE + tmp[i+31:i] := Cast_FP32(0x7F7FFFFF) + FI +ENDFOR +dst[31:0] := REDUCE_MIN(tmp, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MAX(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MAX(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +DEFINE REDUCE_MAX(src, len) { + IF len == 2 + RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MAX(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MAX(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*64 + src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) + ENDFOR + RETURN REDUCE_MIN(src[64*len-1:0], len) +} +dst[63:0] := REDUCE_MIN(a, 8) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +DEFINE REDUCE_MIN(src, len) { + IF len == 2 + RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) + FI + len := len / 2 + FOR j:= 0 to (len-1) + i := j*32 + src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) + ENDFOR + RETURN REDUCE_MIN(src[32*len-1:0], len) +} +dst[31:0] := REDUCE_MIN(a, 16) + + AVX512F +
immintrin.h
+ Special Math Functions +
+ + + + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 15 + i := j*32 + IF count[i+31:i] < 32 + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Shift +
+ + + + Cast vector of type __m512d to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512d to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512 to type __m512i. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512d. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Cast vector of type __m512i to type __m512. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512F +
immintrin.h
+ Cast +
+ + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + l := j*64 + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + n := j*64 + IF k[j] + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[n+63:n] := src[n+63:n] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". + +FOR j := 0 to 7 + i := j*32 + n := j*64 + dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + l := j*64 + IF k[j] + dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) + ELSE + dst[l+63:l] := src[l+63:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. + +FOR j := 0 to 7 + i := j*64 + k := j*32 + dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. + +FOR j := 0 to 7 + i := j*64 + l := j*32 + IF k[j] + dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) + ELSE + dst[l+31:l] := src[l+31:l] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". + +FOR j := 0 to 7 + i := j*64 + m := j*32 + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + + Stores 8 packed 64-bit integer elements located in "a" and stores them in memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). + +FOR j := 0 to 7 + i := j*64 + m := j*32 + IF k[j] + addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 + MEM[addr+63:addr] := a[i+63:i] + FI +ENDFOR + + + AVX512F +
immintrin.h
+ Store +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512IFMA52 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) + dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512IFMA52 +
immintrin.h
+ Arithmetic +
+ + + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512VPOPCNTDQ + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := POPCNT(a[i+31:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := POPCNT(a[i+31:i]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := POPCNT(a[i+63:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := POPCNT(a[i+63:i]) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512VPOPCNTDQ +
immintrin.h
+ Bit Manipulation +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 15 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert the BF16 (16-bit) floating-point element in "a" to a floating-point element, and store the result in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +dst[31:0] := Convert_BF16_To_FP32(a[15:0]) + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 31 + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF j < 16 + t := b.fp32[j] + ELSE + t := a.fp32[j-16] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Convert +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BF16 + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 3 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. + +FOR j := 0 to 7 + i := j*32 + m := j*16 + IF k[j] + dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert the single-precision (32-bit) floating-point element in "a" to a BF16 (16-bit) floating-point element, and store the result in "dst". + +dst[15:0] := Convert_FP32_To_BF16(a[31:0]) + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 7 + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF j < 4 + t := b.fp32[j] + ELSE + t := a.fp32[j-4] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". + +FOR j := 0 to 15 + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF j < 8 + t := b.fp32[j] + ELSE + t := a.fp32[j-8] + FI + dst.word[j] := Convert_FP32_To_BF16(t) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE make_fp32(x[15:0]) { + y.fp32 := 0.0 + y[31:16] := x[15:0] + RETURN y +} +dst := src +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) + dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BF16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:32] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 3 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:32] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:16] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 1 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:16] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_BITALG + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + IF k[i*8+j] + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ELSE + dst[i*8+j] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". + +FOR i := 0 to 7 //Qword + FOR j := 0 to 7 // Byte + m := c.qword[i].byte[j] & 0x3F + dst[i*8+j] := b.qword[i].bit[m] + ENDFOR +ENDFOR +dst[MAX:64] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POPCNT(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POPCNT(a[i+15:i]) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + dst[i+7:i] := POPCNT(a[i+7:i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE POPCNT(a) { + count := 0 + DO WHILE a > 0 + count += a[0] + a >>= 1 + OD + RETURN count +} +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := POPCNT(a[i+7:i]) + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_BITALG +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InvCubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +cos_res[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + + Elementary Math Functions +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := SQRT(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 15 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:256] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ACOS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ACOSH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ASIN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ASINH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ATAN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ATANH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CubeRoot(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CDFNormal(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := CEIL(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COS(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COSD(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := COSH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Probability/StatisticsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(e), a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := FLOOR(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := InvSQRT(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(1.0 + a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := LOG(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + + + + Elementary Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := NearbyInt(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := (1.0 / a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := RoundToNearestEven(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). + + + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) + ELSE + dst[i+15:i] := sin_src[i+15:i] + MEM[mem_addr+i+15:mem_addr+i] := cos_src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SIND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := SINH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ROUND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TAN(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + TrigonometryFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TAND(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TANH(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + + + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := TRUNCATE(a[i+15:i]) + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := NearbyInt(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst". + + Elementary Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := (1.0 / a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := RoundToNearestEven(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +cos_res[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 31 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:512] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ACOS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ACOSH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ASIN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ASINH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATAN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ATANH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InverseCDFNormal(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COSD(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := COSH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 - ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Probability/StatisticsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := 1.0 / ERF(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InvCubeRoot(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := InvSQRT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(1.0 + a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := LOG(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + + Elementary Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ConvertExpFP16(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIN(a[i+15:i]) + MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +cos_res[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SIND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SINH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := CEIL(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := FLOOR(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". + + Special Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ROUND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ph". + + Elementary Math Functions +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SQRT(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TAN(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + + TrigonometryFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TAND(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + + Trigonometry +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TANH(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". + + Special Math FunctionsFOR j := 0 to 7 + i := j*16 + dst[i+15:i] := TRUNCATE(a[i+15:i]) +ENDFOR +dst[MAX:128] := 0 +
immintrin.h
AVX512_FP16
+ + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 15 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 7 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 15 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 3 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 7 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + +tmp := a +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". + +tmp := a +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 7 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 15 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Compare +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. + +FOR j := 0 TO 1 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:32] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 TO 3 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. + +FOR j := 0 to 3 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:64] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 1 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 1 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 3 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 3 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Convert +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [max_float_note] + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 7 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 15 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [min_float_note] + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Special Math Functions +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 7 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 15 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 7 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 7 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:8] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 15 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 15 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:16] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + off := idx[i+2:i] + dst.fp16[j] := idx[i+3] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + off := idx[i+3:i] + dst.fp16[j] := idx[i+4] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 7 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*16 + id := idx[i+3:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + id := idx[i+2:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Miscellaneous +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 7 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 15 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 7 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Elementary Math Functions +
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[255:0] := MEM[mem_addr+255:mem_addr] +dst[MAX:256] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] +dst[MAX:128] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+255:mem_addr] := a[255:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Store +
+ + + + Return vector of type __m256h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128h with undefined elements. + AVX512_FP16 + AVX512VL +
immintrin.h
+ General Support +
+ + + + Return vector of type __m256h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
+ + + + Return vector of type __m128h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 + AVX512VL +
immintrin.h
+ Set +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] + b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] + b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] + b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] + b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := a.fp16[j] / b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := a.fp16[j] / b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] / b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] / b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := a.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := c.fp16[0] +FI +dst[127:16] := c[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". + [round_note] + +FOR j := 0 to 31 + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := c.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 31 + IF k[j] + IF ((j & 1) == 0) + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] + ELSE + dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] + FI + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := a.fp16[j] - b.fp16[j] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := a.fp16[j] - b.fp16[j] + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] - b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] - b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". + [round_note] + +FOR i := 0 TO 31 + dst.fp16[i] := a.fp16[i] * b.fp16[i] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := a.fp16[i] * b.fp16[i] + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := a.fp16[0] * b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := a.fp16[0] * b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := src.fp16[2*i+0] + dst.fp16[2*i+1] := src.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := src.fp16[0] + dst.fp16[1] := src.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := a.fp16[2*i+0] + dst.fp16[2*i+1] := a.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := c.fp16[2*i+0] + dst.fp16[2*i+1] := c.fp16[2*i+1] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +FOR i := 0 to 15 + IF k[i] + dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] + dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] + ELSE + dst.fp16[2*i+0] := 0 + dst.fp16[2*i+1] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] +dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := a.fp16[0] + dst.fp16[1] := a.fp16[1] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := c.fp16[0] + dst.fp16[1] := c.fp16[1] +FI +dst[127:32] := c[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + [round_note] + +IF k[0] + dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] + dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] +ELSE + dst.fp16[0] := 0 + dst.fp16[1] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] + a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := tmp.fp16[i] * a.fp16[i+16] +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] +ENDFOR +dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". [max_float_note] + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := (a.fp16[i] > a.fp16[i+16] ? a.fp16[i] : a.fp16[i+16]) +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] + +tmp := a +FOR i := 0 to 15 + tmp.fp16[i] := (a.fp16[i] < a.fp16[i+16] ? tmp.fp16[i] : a.fp16[i+16]) +ENDFOR +FOR i := 0 to 7 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) +ENDFOR +FOR i := 0 to 3 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) +ENDFOR +FOR i := 0 to 1 + tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) +ENDFOR +dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". + +FOR j := 0 to 31 + dst.fp16[j] := ABS(v2.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Arithmetic +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +FOR j := 0 to 31 + IF k1[j] + k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 + ELSE + k[j] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +IF k1[0] + k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] + CASE (imm8[4:0]) OF +0: OP := _CMP_EQ_OQ +1: OP := _CMP_LT_OS +2: OP := _CMP_LE_OS +3: OP := _CMP_UNORD_Q +4: OP := _CMP_NEQ_UQ +5: OP := _CMP_NLT_US +6: OP := _CMP_NLE_US +7: OP := _CMP_ORD_Q +8: OP := _CMP_EQ_UQ +9: OP := _CMP_NGE_US +10: OP := _CMP_NGT_US +11: OP := _CMP_FALSE_OQ +12: OP := _CMP_NEQ_OQ +13: OP := _CMP_GE_OS +14: OP := _CMP_GT_OS +15: OP := _CMP_TRUE_UQ +16: OP := _CMP_EQ_OS +17: OP := _CMP_LT_OQ +18: OP := _CMP_LE_OQ +19: OP := _CMP_UNORD_S +20: OP := _CMP_NEQ_US +21: OP := _CMP_NLT_UQ +22: OP := _CMP_NLE_UQ +23: OP := _CMP_ORD_S +24: OP := _CMP_EQ_US +25: OP := _CMP_NGE_UQ +26: OP := _CMP_NGT_UQ +27: OP := _CMP_FALSE_OS +28: OP := _CMP_NEQ_OS +29: OP := _CMP_GE_OQ +30: OP := _CMP_GT_OQ +31: OP := _CMP_TRUE_US +ESAC +RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + + Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 + + + AVX512_FP16 +
immintrin.h
+ Compare +
+ + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_note] + +FOR j := 0 to 15 + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 15 + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 15 + IF k[j] + dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 7 + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := src.qword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 7 + IF k[j] + dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) + ELSE + dst.qword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". + [round_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". [sae_note] + +FOR j := 0 TO 31 + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := src.word[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 TO 31 + IF k[j] + dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) + ELSE + dst.word[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 7 + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := src.fp64[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 7 + IF k[j] + dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) + ELSE + dst.fp64[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] + +FOR j := 0 to 15 + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := src.fp32[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] + +FOR j := 0 to 15 + IF k[j] + dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) + ELSE + dst.fp32[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note] + +dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := src.fp64[0] +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] + +IF k[0] + dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) +ELSE + dst.fp64[0] := 0 +FI +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := src.fp32[0] +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + + Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] + +IF k[0] + dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) +ELSE + dst.fp32[0] := 0 +FI +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + [round_note] + +dst.dword := Convert_FP16_To_Int32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + [round_note] + +dst.qword := Convert_FP16_To_Int64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". [round_note] + +dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". [sae_note] + +dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + + Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy 16-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower 16-bit integer in "a" to "dst". + +dst.fp16[0] := a.fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". + +dst[15:0] := a.fp16[0] + + AVX512_FP16 +
immintrin.h
+ Convert +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] [min_float_note] + +FOR j := 0 to 31 + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := src.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) + ELSE + dst.fp16[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + + + + + Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +IF k[0] + dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Special Math Functions +
+ + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst", and zero the upper elements. + +dst.fp16[0] := MEM[mem_addr].fp16[0] +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. + +IF k[0] + dst.fp16[0] := MEM[mem_addr].fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[MAX:16] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[511:0] := MEM[mem_addr+511:mem_addr] +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Load +
+ + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory. + +MEM[mem_addr].fp16[0] := a.fp16[0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + + Store the lower half-precision (16-bit) floating-point element from "a" into memory using writemask "k". + +IF k[0] + MEM[mem_addr].fp16[0] := a.fp16[0] +FI + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+511:mem_addr] := a[511:0] + + + AVX512_FP16 +
immintrin.h
+ Store +
+ + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := b.fp16[0] +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + + Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := b.fp16[0] +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Move +
+ + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] + +DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { + m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) + RETURN tmp.fp16 +} +IF k[0] + dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] + FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ConvertExpFP16(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] + IF k[0] + dst.fp16[0] := ConvertExpFP16(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + FOR i := 0 TO 31 + IF k[i] + dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. + [getmant_note][sae_note] + IF k[0] + dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] + +DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { + m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved + tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) + tmp[15:0] := src[15:0] - tmp[15:0] + IF IsInf(tmp[15:0]) + tmp[15:0] := FP16(0.0) + FI + RETURN tmp[15:0] +} +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (src1.exp == 0) and (src1.fraction != 0) + denormal2 := (src2.exp == 0) and (src2.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +FOR i := 0 to 15 + IF k[i] + dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + + Scale the lower half-precision (16-bit) floating-point element in "a" using the lower element from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + DEFINE ScaleFP16(src1, src2) { + denormal1 := (a.exp == 0) and (a.fraction != 0) + denormal2 := (b.exp == 0) and (b.fraction != 0) + tmp1 := src1 + tmp2 := src2 + IF MXCSR.DAZ + IF denormal1 + tmp1 := 0 + FI + IF denormal2 + tmp2 := 0 + FI + FI + RETURN tmp1 * POW(2.0, FLOOR(tmp2)) +} +IF k[0] + dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". + [fpclass_note] + FOR i := 0 to 31 + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). + [fpclass_note] + FOR i := 0 to 31 + IF k1[i] + k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) + ELSE + k[i] := 0 + FI +ENDFOR +k[MAX:32] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". + [fpclass_note] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). + [fpclass_note] + IF k1[0] + k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) +ELSE + k[0] := 0 +FI +k[MAX:1] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + off := idx[i+4:i] + dst.fp16[j] := idx[i+5] ? b.fp16[off] : a.fp16[off] +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + + Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". + +FOR j := 0 to 31 + IF k[j] + dst.fp16[j] := b.fp16[j] + ELSE + dst.fp16[j] := a.fp16[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + + Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*16 + id := idx[i+4:i] + dst.fp16[j] := a.fp16[id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Miscellaneous +
+ + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". + [round_note] + +FOR i := 0 to 31 + dst.fp16[i] := SQRT(a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + [round_note] + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := SQRT(a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst.fp16[0] := SQRT(b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". + [round_note] + +IF k[0] + dst.fp16[0] := SQRT(b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + dst.fp16[i] := (1.0 / a.fp16[i]) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := src.fp16[i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR i := 0 to 31 + IF k[i] + dst.fp16[i] := (1.0 / a.fp16[i]) + ELSE + dst.fp16[i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst.fp16[0] := (1.0 / b.fp16[0]) +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := src.fp16[0] +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +IF k[0] + dst.fp16[0] := (1.0 / b.fp16[0]) +ELSE + dst.fp16[0] := 0 +FI +dst[127:16] := a[127:16] +dst[MAX:128] := 0 + + + AVX512_FP16 +
immintrin.h
+ Elementary Math Functions +
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. + +dst.fp16[0] := e0 +dst.fp16[1] := e1 +dst.fp16[2] := e2 +dst.fp16[3] := e3 +dst.fp16[4] := e4 +dst.fp16[5] := e5 +dst.fp16[6] := e6 +dst.fp16[7] := e7 +dst.fp16[8] := e8 +dst.fp16[9] := e9 +dst.fp16[10] := e10 +dst.fp16[11] := e11 +dst.fp16[12] := e12 +dst.fp16[13] := e13 +dst.fp16[14] := e14 +dst.fp16[15] := e15 +dst.fp16[16] := e16 +dst.fp16[17] := e17 +dst.fp16[18] := e18 +dst.fp16[19] := e19 +dst.fp16[20] := e20 +dst.fp16[21] := e21 +dst.fp16[22] := e22 +dst.fp16[23] := e23 +dst.fp16[24] := e24 +dst.fp16[25] := e25 +dst.fp16[26] := e26 +dst.fp16[27] := e27 +dst.fp16[28] := e28 +dst.fp16[29] := e29 +dst.fp16[30] := e30 +dst.fp16[31] := e31 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e7 +dst.fp16[1] := e6 +dst.fp16[2] := e5 +dst.fp16[3] := e4 +dst.fp16[4] := e3 +dst.fp16[5] := e2 +dst.fp16[6] := e1 +dst.fp16[7] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e15 +dst.fp16[1] := e14 +dst.fp16[2] := e13 +dst.fp16[3] := e12 +dst.fp16[4] := e11 +dst.fp16[5] := e10 +dst.fp16[6] := e9 +dst.fp16[7] := e8 +dst.fp16[8] := e7 +dst.fp16[9] := e6 +dst.fp16[10] := e5 +dst.fp16[11] := e4 +dst.fp16[12] := e3 +dst.fp16[13] := e2 +dst.fp16[14] := e1 +dst.fp16[15] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst.fp16[0] := e31 +dst.fp16[1] := e30 +dst.fp16[2] := e29 +dst.fp16[3] := e28 +dst.fp16[4] := e27 +dst.fp16[5] := e26 +dst.fp16[6] := e25 +dst.fp16[7] := e24 +dst.fp16[8] := e23 +dst.fp16[9] := e22 +dst.fp16[10] := e21 +dst.fp16[11] := e20 +dst.fp16[12] := e19 +dst.fp16[13] := e18 +dst.fp16[14] := e17 +dst.fp16[15] := e16 +dst.fp16[16] := e15 +dst.fp16[17] := e14 +dst.fp16[18] := e13 +dst.fp16[19] := e12 +dst.fp16[20] := e11 +dst.fp16[21] := e10 +dst.fp16[22] := e9 +dst.fp16[23] := e8 +dst.fp16[24] := e7 +dst.fp16[25] := e6 +dst.fp16[26] := e5 +dst.fp16[27] := e4 +dst.fp16[28] := e3 +dst.fp16[29] := e2 +dst.fp16[30] := e1 +dst.fp16[31] := e0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 7 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 15 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". + +FOR i := 0 to 31 + dst.fp16[i] := a[15:0] +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 3 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:128] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 7 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:256] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". + +FOR i := 0 to 15 + dst.fp16[2*i+0] := a[15:0] + dst.fp16[2*i+1] := a[31:16] +ENDFOR +dst[MAX:512] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Copy half-precision (16-bit) floating-point element "a" to the lower element of "dst", and zero the upper 7 elements. + +dst.fp16[0] := a[15:0] +dst[127:16] := 0 + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + Return vector of type __m512h with all elements set to zero. + +dst[MAX:0] := 0 + + + AVX512_FP16 +
immintrin.h
+ Set +
+ + + + Cast vector of type "__m128h" to type "__m128". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m128i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m256i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m512i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128d" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256d" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512d" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128i" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256i" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512i" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m512h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m256h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m128h" to type "__m512h"; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + + Cast vector of type "__m256h" to type "__m512h"; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + AVX512_FP16 +
immintrin.h
+ Cast +
+ + + Return vector of type __m512h with undefined elements. + AVX512_FP16 +
immintrin.h
+ General Support +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 3 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 1 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Bit Manipulation +
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + id := idx[i+4:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + id := idx[i+3:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 31 + i := j*8 + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*8 + IF k[j] + off := 8*idx[i+4:i] + dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*8 + IF k[j] + off := 8*idx[i+3:i] + dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + + AVX512_VBMI + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + + For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR i := 0 to 7 + q := i * 64 + FOR j := 0 to 7 + tmp8 := 0 + ctrl := a[q+j*8+7:q+j*8] & 63 + FOR l := 0 to 7 + tmp8[l] := b[q+((ctrl+l) & 63)] + ENDFOR + IF k[i*8+j] + dst[q+j*8+7:q+j*8] := tmp8[7:0] + ELSE + dst[q+j*8+7:q+j*8] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Bit Manipulation +
+ + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + dst[i+7:i] := a[id+7:id] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + id := idx[i+5:i]*8 + IF k[j] + dst[i+7:i] := a[id+7:id] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". + +FOR j := 0 to 63 + i := j*8 + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := idx[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 63 + i := j*8 + IF k[j] + off := 8*idx[i+5:i] + dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + + AVX512_VBMI +
immintrin.h
+ Swizzle +
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 1 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 7 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 15 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 15 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 7 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := 0 +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 31 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[255:m] := src[255:m] +dst[MAX:256] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := 0 +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 15 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[127:m] := src[127:m] +dst[MAX:128] := 0 + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 15 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 7 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 31 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 15 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 + AVX512VL +
immintrin.h
+ Store +
+ + + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + i := j*64 + IF k[j] + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] + ELSE + dst[i+63:i] := src[i+63:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". + +FOR j := 0 to 7 + i := j*64 + tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] + dst[i+63:i] := tmp[127:64] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + i := j*32 + IF k[j] + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] + ELSE + dst[i+31:i] := src[i+31:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". + +FOR j := 0 to 15 + i := j*32 + tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] + dst[i+31:i] := tmp[63:32] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 31 + i := j*16 + IF k[j] + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + + + + + Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". + +FOR j := 0 to 31 + i := j*16 + tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] + dst[i+15:i] := tmp[31:16] +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Shift +
+ + Swizzle + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + Swizzle + + + + + Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Load +
+ + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[i+15:i] := a[m+15:m] + m := m + 16 + ELSE + dst[i+15:i] := src[i+15:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[i+7:i] := a[m+7:m] + m := m + 8 + ELSE + dst[i+7:i] := src[i+7:i] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 16 +m := 0 +FOR j := 0 to 31 + i := j*16 + IF k[j] + dst[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := 0 +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". + +size := 8 +m := 0 +FOR j := 0 to 63 + i := j*8 + IF k[j] + dst[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR +dst[511:m] := src[511:m] +dst[MAX:512] := 0 + + + AVX512_VBMI2 +
immintrin.h
+ Swizzle +
+ + Swizzle + + + + + Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 16 +m := base_addr +FOR j := 0 to 31 + i := j*16 + IF k[j] + MEM[m+size-1:m] := a[i+15:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + Swizzle + + + + + Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". + +size := 8 +m := base_addr +FOR j := 0 to 63 + i := j*8 + IF k[j] + MEM[m+size-1:m] := a[i+7:i] + m := m + size + FI +ENDFOR + + + AVX512_VBMI2 +
immintrin.h
+ Store +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 7 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 3 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX512_VNNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +FOR j := 0 to 15 + IF k[j] + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 + ELSE + dst.dword[j] := src.dword[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 15 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:512] := 0 + + + AVX512_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+15:k1] := 0 +MEM[k2+15:k2] := 0 +FOR i := 0 TO 15 + FOR j := 0 TO 15 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+15:k1].bit[i] |= match + MEM[k2+15:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512F +
immintrin.h
+ Mask +
+ + + + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 7 + FOR j := 0 TO 7 + match := (a.dword[i] == b.dword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 1 + FOR j := 0 TO 1 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + + + Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. + +MEM[k1+7:k1] := 0 +MEM[k2+7:k2] := 0 +FOR i := 0 TO 3 + FOR j := 0 TO 3 + match := (a.qword[i] == b.qword[j] ? 1 : 0) + MEM[k1+7:k1].bit[i] |= match + MEM[k2+7:k2].bit[j] |= match + ENDFOR +ENDFOR + + + AVX512_VP2INTERSECT + AVX512VL +
immintrin.h
+ Mask +
+ + + + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 3 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". + + +FOR j := 0 to 1 + i := j*64 + tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) + dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_IFMA +
immintrin.h
+ Arithmetic +
+ + + + Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_BF16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_FP16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:256] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_BF16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +b := Convert_FP16_To_FP32(MEM[__A+15:__A]) +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := b +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + m := j*32 + dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 7 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". + + +FOR j := 0 to 3 + dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) +ENDFOR +dst[MAX:128] := 0 + + + AVX_NE_CONVERT +
immintrin.h
+ Convert +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) + tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) + tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) + tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := src.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) + tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) + dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + AVX_VNNI +
immintrin.h
+ Arithmetic +
+ + + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) + tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT16 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:256] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 7 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:256] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) + tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) + tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) + tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) + tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) + tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) + tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) + dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +ENDFOR +dst[MAX:128] := 0 + + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". + + +FOR j := 0 to 3 + tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) + tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) + tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) + tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) + dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +ENDFOR +dst[MAX:128] := 0 + + + + AVX_VNNI_INT8 +
immintrin.h
+ Arithmetic +
+ + + + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control". + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". + +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 7:0 of "control". + +start := control[7:0] +len := control[15:8] +tmp[511:0] := a +dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". + +dst := (-a) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a". + +dst := (a - 1) XOR a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". + +dst := (a - 1) AND a + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 32-bit integer "a" and then AND with "b", and store the results in "dst". + +dst[31:0] := ((NOT a[31:0]) AND b[31:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + Compute the bitwise NOT of 64-bit integer "a" and then AND with "b", and store the results in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 16-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 16) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 0 +dst := 0 +DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + dst := dst + 1 +OD + + + BMI1 +
immintrin.h
+ Bit Manipulation +
+ + + + + + + Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 32) + dst[31:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". + +n := index[7:0] +dst := a +IF (n < 64) + dst[63:n] := 0 +FI + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[m] := tmp[k] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 32 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. + +tmp := a +dst := 0 +m := 0 +k := 0 +DO WHILE m < 64 + IF mask[m] == 1 + dst[k] := tmp[m] + k := k + 1 + FI + m := m + 1 +OD + + + BMI2 +
immintrin.h
+ Bit Manipulation +
+ + + + + + Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags. + +dst[31:0] := (a * b)[31:0] +MEM[hi+31:hi] := (a * b)[63:32] + + + BMI2 +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags. + +dst[63:0] := (a * b)[63:0] +MEM[hi+63:hi] := (a * b)[127:64] + + + BMI2 +
immintrin.h
+ Arithmetic +
+ + + + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 8 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the low 32-bits of the current shadow stack pointer, and store the result in "dst". + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Read the current shadow stack pointer, and store the result in "dst". + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Save the previous shadow stack pointer context. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Restore the saved shadow stack pointer from the shadow stack restore token previously created on shadow stack by saveprevssp. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 32-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Write 64-bit value in "val" to a user shadow stack page in memory specified by "p". + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by IA32_PL0_SSP as busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Mark shadow stack pointed to by "p" as not busy. + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[31:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0. + dst := SSP[63:0] + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". + +SSP := SSP + a[7:0] * 4 + + + CET_SS +
immintrin.h
+ Miscellaneous +
+ + + + + Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core. + + CLDEMOTE +
immintrin.h
+ Miscellaneous +
+ + + + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + + CLFLUSHOPT +
immintrin.h
+ General Support +
+ + + + + + Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain. + + CLWB +
immintrin.h
+ General Support +
+ + + + + + + + + Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". + CASE (__D[3:0]) OF +0: OP := _CMPCCX_O +1: OP := _CMPCCX_NO +2: OP := _CMPCCX_B +3: OP := _CMPCCX_NB +4: OP := _CMPCCX_Z +5: OP := _CMPCCX_NZ +6: OP := _CMPCCX_BE +7: OP := _CMPCCX_NBE +8: OP := _CMPCCX_S +9: OP := _CMPCCX_NS +10: OP := _CMPCCX_P +11: OP := _CMPCCX_NP +12: OP := _CMPCCX_L +13: OP := _CMPCCX_NL +14: OP := _CMPCCX_LE +15: OP := _CMPCCX_NLE +ESAC +tmp1 := LOAD_LOCK(__A) +tmp2 := tmp1 + __C +IF (tmp1[31:0] OP __B[31:0]) + STORE_UNLOCK(__A, tmp2) +ELSE + STORE_UNLOCK(__A, tmp1) +FI +dst[31:0] := tmp1[31:0] + + + + + + + + + + + + + + + + + + CMPCCXADD +
immintrin.h
+ Arithmetic +
+ + + + + + + Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". + CASE (__D[3:0]) OF +0: OP := _CMPCCX_O +1: OP := _CMPCCX_NO +2: OP := _CMPCCX_B +3: OP := _CMPCCX_NB +4: OP := _CMPCCX_Z +5: OP := _CMPCCX_NZ +6: OP := _CMPCCX_BE +7: OP := _CMPCCX_NBE +8: OP := _CMPCCX_S +9: OP := _CMPCCX_NS +10: OP := _CMPCCX_P +11: OP := _CMPCCX_NP +12: OP := _CMPCCX_L +13: OP := _CMPCCX_NL +14: OP := _CMPCCX_LE +15: OP := _CMPCCX_NLE +ESAC +tmp1 := LOAD_LOCK(__A) +tmp2 := tmp1 + __C +IF (tmp1[63:0] OP __B[63:0]) + STORE_UNLOCK(__A, tmp2) +ELSE + STORE_UNLOCK(__A, tmp1) +FI +dst[63:0] := tmp1[63:0] + + + + + + + + + + + + + + + + + + CMPCCXADD +
immintrin.h
+ Arithmetic +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". + tmp1[7:0] := v[0:7] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[39:0] := tmp1[7:0] << 32 +tmp4[39:0] := tmp2[31:0] << 8 +tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] +tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". + tmp1[15:0] := v[0:15] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[47:0] := tmp1[15:0] << 32 +tmp4[47:0] := tmp2[31:0] << 16 +tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] +tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". + tmp1[31:0] := v[0:31] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[63:0] := tmp1[31:0] << 32 +tmp4[63:0] := tmp2[31:0] << 32 +tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] +tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". + tmp1[63:0] := v[0:63] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[95:0] := tmp1[63:0] << 32 +tmp4[95:0] := tmp2[31:0] << 64 +tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] +tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + CRC32 +
nmmintrin.h
+ Cryptography +
+ + + + + + + Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst". This intrinsic may only be used in User mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst". This intrinsic may only be used in Privileged mode. + + ENQCMD +
immintrin.h
+ Unknown +
+ + + + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:256] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_imm_note] + +FOR j := 0 to 7 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:128] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + m := j*16 + dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) +ENDFOR +dst[MAX:128] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". + [round_imm_note] + +FOR j := 0 to 3 + i := 16*j + l := 32*j + dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) +ENDFOR +dst[MAX:64] := 0 + + + F16C +
immintrin.h
+ Convert +
+ + + + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] + ELSE + dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] + ELSE + dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] + FI +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*64 + dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". + +FOR j := 0 to 7 + i := j*32 + dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] +ENDFOR +dst[MAX:256] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + + + FMA +
immintrin.h
+ Arithmetic +
+ + + + + Read the FS segment base register and store the 32-bit result in "dst". + dst[31:0] := FS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the FS segment base register and store the 64-bit result in "dst". + dst[63:0] := FS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 32-bit result in "dst". + dst[31:0] := GS_Segment_Base_Register +dst[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + Read the GS segment base register and store the 64-bit result in "dst". + dst[63:0] := GS_Segment_Base_Register + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[31:0] := a[31:0] +FS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the FS segment base register. + +FS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 32-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[31:0] := a[31:0] +GS_Segment_Base_Register[63:32] := 0 + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + Write the unsigned 64-bit integer "a" to the GS segment base register. + +GS_Segment_Base_Register[63:0] := a[63:0] + + + FSGSBASE +
immintrin.h
+ General Support +
+ + + + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. + state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr]) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. + MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse) + + + FXSR +
immintrin.h
+ OS-Targeted +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 63 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 7 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:512] := 0 + + + GFNI + AVX512F +
immintrin.h
+ Arithmetic +
+ + + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 31 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := 0 + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + IF k[j] + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) + ELSE + dst.byte[j] := src.byte[j] + FI +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. + +DEFINE gf2p8mul_byte(src1byte, src2byte) { + tword := 0 + FOR i := 0 to 7 + IF src2byte.bit[i] + tword := tword XOR (src1byte << i) + FI + ENDFOR + FOR i := 14 downto 8 + p := 0x11B << (i-8) + IF tword.bit[i] + tword := tword XOR p + FI + ENDFOR + RETURN tword.byte[0] +} +FOR j := 0 TO 15 + dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". + +DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 3 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:256] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := 0 + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + IF k[j*8+i] + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ELSE + dst.qword[j].byte[i] := src.qword[j].byte[i] + FI + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". + DEFINE parity(x) { + t := 0 + FOR i := 0 to 7 + t := t XOR x.bit[i] + ENDFOR + RETURN t +} +DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { + FOR i := 0 to 7 + retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] + ENDFOR + RETURN retbyte +} +FOR j := 0 TO 1 + FOR i := 0 to 7 + dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) + ENDFOR +ENDFOR +dst[MAX:128] := 0 + + + GFNI + AVX512VL +
immintrin.h
+ Arithmetic +
+ + + + + + Provides a hint to the processor to selectively reset the prediction history of the current logical processor specified by a signed 32-bit integer "__eax". + + HRESET +
immintrin.h
+ General Support +
+ + + + + + Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". + The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. + The types supported are: + 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. + 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well. + 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID. + 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well. 
+ +CASE type[1:0] OF +0: // individual-address invalidation retaining global translations + OP_PCID := MEM[descriptor+11:descriptor] + ADDR := MEM[descriptor+127:descriptor+64] + BREAK +1: // single PCID invalidation retaining globals + OP_PCID := MEM[descriptor+11:descriptor] + // invalidate all mappings tagged with OP_PCID except global translations + BREAK +2: // all PCID invalidation + // invalidate all mappings tagged with any PCID + BREAK +3: // all PCID invalidation retaining global translations + // invalidate all mappings tagged with any PCID except global translations + BREAK +ESAC + + + INVPCID +
immintrin.h
+ OS-Targeted +
+ + + + Flag + + + + + Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. + MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], __h[383:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], __h[511:0]) +dst := ZF + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Wrap a 128-bit AES key from "__key" into a 384-bit key handle stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The explicit source operand "__htype" specifies handle restrictions. + __h[383:0] := WrapKey128(__key[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Wrap a 256-bit AES key from "__key_hi" and "__key_lo" into a 512-bit key stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The 32-bit "__htype" specifies handle restrictions. + __h[511:0] := WrapKey256(__key_lo[127:0], __key_hi[127:0], __htype) +dst[0] := IWKey.NoBackup +dst[4:1] := IWKey.KeySource[3:0] + + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + + Load internal wrapping key (IWKey). The 32-bit unsigned integer "__ctl" specifies IWKey's KeySource and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from "__enkey_lo" and "__enkey_hi". IWKey's 128-bit integrity key is loaded from "__intkey". + + KEYLOCKER +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Decrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Decrypt (__idata[i], __h[511:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES128Encrypt (__idata[i], __h[383:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + Flag + + + + + Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". + FOR i := 0 to 7 + __odata[i] := AES256Encrypt (__idata[i], __h[511:0]) +ENDFOR +dst := ZF + + + KEYLOCKER_WIDE +
immintrin.h
+ Cryptography +
+ + + + + Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst". + +tmp := 31 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD + + + LZCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst". + +tmp := 63 +dst := 0 +DO WHILE (tmp >= 0 AND a[tmp] == 0) + tmp := tmp - 1 + dst := dst + 1 +OD + + + LZCNT +
immintrin.h
+ Bit Manipulation +
+ + + + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := 0 + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := 0 + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to "dst". + +dst[63:0] := a[63:0] + + + MMX +
mmintrin.h
+ Convert +
+ + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + + MMX +
mmintrin.h
+ General Support +
+ + + + Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. + + MMX +
mmintrin.h
+ General Support +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(b[15:0]) +dst[47:40] := Saturate8(b[31:16]) +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(b[15:0]) +dst[47:40] := Saturate8(b[31:16]) +dst[55:48] := Saturate8(b[47:32]) +dst[63:56] := Saturate8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(b[31:0]) +dst[63:48] := Saturate16(b[63:32]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(b[15:0]) +dst[47:40] := SaturateU8(b[31:16]) +dst[55:48] := SaturateU8(b[47:32]) +dst[63:56] := SaturateU8(b[63:48]) + + + MMX +
mmintrin.h
+ Miscellaneous +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[39:32] + dst[15:8] := src2[39:32] + dst[23:16] := src1[47:40] + dst[31:24] := src2[47:40] + dst[39:32] := src1[55:48] + dst[47:40] := src2[55:48] + dst[55:48] := src1[63:56] + dst[63:56] := src2[63:56] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[47:32] + dst[31:16] := src2[47:32] + dst[47:32] := src1[63:48] + dst[63:48] := src2[63:48] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := b[63:32] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + RETURN dst[63:0] +} +dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := b[31:0] + + + MMX +
mmintrin.h
+ Swizzle +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[15:0] +ENDFOR + + + MMX +
mmintrin.h
+ Arithmetic +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". + +IF count[63:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". + +IF imm8[7:0] > 63 + dst[63:0] := 0 +ELSE + dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) +FI + + + MMX +
mmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[63:0] := ((NOT a[63:0]) AND b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] OR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[63:0] := (a[63:0] XOR b[63:0]) + + + MMX +
mmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + MMX +
mmintrin.h
+ Compare +
+ + + + Return vector of type __m64 with all elements set to zero. + +dst[MAX:0] := 0 + + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + MMX +
mmintrin.h
+ Set +
+ + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e1 +dst[63:32] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e3 +dst[31:16] := e2 +dst[47:32] := e1 +dst[63:48] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e7 +dst[15:8] := e6 +dst[23:16] := e5 +dst[31:24] := e4 +dst[39:32] := e3 +dst[47:40] := e2 +dst[55:48] := e1 +dst[63:56] := e0 + + MMX +
mmintrin.h
+ Set +
+ + + + + + + + Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints". + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR. + + MONITOR +
pmmintrin.h
+ General Support +
+ + + + + + Load 16 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 1 + i := j*8 + dst[i+7:i] := MEM[ptr+15-i:ptr+8-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 32 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 3 + i := j*8 + dst[i+7:i] := MEM[ptr+31-i:ptr+24-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + Load 64 bits from memory, perform a byte swap operation, and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MEM[ptr+63-i:ptr+56-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Load +
+ + + + + Perform a byte swap operation of the 16 bits in "data", and store the results to memory. + +FOR j := 0 to 1 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[15-i:8-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Store +
+ + + + + Perform a byte swap operation of the 32 bits in "data", and store the results to memory. + +addr := MEM[ptr] +FOR j := 0 to 3 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[31-i:24-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Store +
+ + + + + Perform a byte swap operation of the 64 bits in "data", and store the results to memory. + +addr := MEM[ptr] +FOR j := 0 to 7 + i := j*8 + MEM[ptr+i+7:ptr+i] := data[63-i:56-i] +ENDFOR + + + MOVBE +
immintrin.h
+ Store +
+ + + + + + + Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst". + +MEM[dst+511:dst] := MEM[src+511:src] + + + MOVDIR64B +
immintrin.h
+ Store +
+ + + + + + + Store 64-bit integer from "val" into memory using direct store. + +MEM[dst+63:dst] := val[63:0] + + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + Store 32-bit integer from "val" into memory using direct store. + +MEM[dst+31:dst] := val[31:0] + + + MOVDIRI +
immintrin.h
+ Store +
+ + + + + + + Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst". + dst := srcmem +dst.LB := srcmem.LB +dst.UB := srcmem + size - 1 + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + + Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst". + dst := q +IF r.LB > (q + size - 1) OR r.UB < q + dst.LB := 1 + dst.UB := 0 +ELSE + dst.LB := MAX(r.LB, q) + dst.UB := MIN(r.UB, (q + size - 1)) +FI + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst". + dst := q +dst.LB := r.LB +dst.UB := r.UB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst". + dst := q +dst.LB := 0 +dst.UB := 0 + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr". + MEM[ptr_addr].LB := ptr_val.LB +MEM[ptr_addr].UB := ptr_val.UB + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Checks if "q" is within its lower bound, and throws a #BR if not. + IF q < q.LB + #BR +FI + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Checks if "q" is within its upper bound, and throws a #BR if not. + IF q > q.UB + #BR +FI + + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not. + IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB + #BR +FI + + + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Return the lower bound of "q". + dst := q.LB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + Return the upper bound of "q". + dst := q.UB + + MPX +
immintrin.h
+ Miscellaneous + +
+ + + + + Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + +tmp := 0 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD +FI +dst := tmp + + +
immintrin.h
+ Bit Manipulation +
+ + + + Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. + +tmp := 31 +IF a == 0 + // dst is undefined +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD +FI +dst := tmp + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 32) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 31) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 31 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the lowest set bit in 64-bit integer "a". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 0 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp < 64) AND a[tmp] == 0) + tmp := tmp + 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 63) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Set "index" to the index of the highest set bit in 64-bit integer "a". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. + +tmp := 63 +IF a == 0 + // MEM[index+31:index] is undefined + dst := 0 +ELSE + DO WHILE ((tmp > 0) AND a[tmp] == 0) + tmp := tmp - 1 + OD + MEM[index+31:index] := tmp + dst := (tmp == 0) ? 0 : 1 +FI + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a". + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to zero. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 0 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 32-bit integer "a", and set that bit to one. + +addr := a + ZeroExtend64(b) +dst[0] := MEM[addr] +MEM[addr] := 1 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a". + +addr := a + b +dst[0] := MEM[addr] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := ~dst[0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to zero. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 0 + + +
immintrin.h
+ Bit Manipulation +
+ + + + + Return the bit at index "b" of 64-bit integer "a", and set that bit to one. + +addr := a + b +dst[0] := MEM[addr] +MEM[addr] := 1 + + +
immintrin.h
+ Bit Manipulation +
+ + + + Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + +dst[7:0] := a[31:24] +dst[15:8] := a[23:16] +dst[23:16] := a[15:8] +dst[31:24] := a[7:0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. + +dst[7:0] := a[63:56] +dst[15:8] := a[55:48] +dst[23:16] := a[47:40] +dst[31:24] := a[39:32] +dst[39:32] := a[31:24] +dst[47:40] := a[23:16] +dst[55:48] := a[15:8] +dst[63:56] := a[7:0] + + +
immintrin.h
+ Bit Manipulation +
+ + + + Cast from type float to type unsigned __int32 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type double to type unsigned __int64 without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int32 to type float without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + Cast from type unsigned __int64 to type double without conversion. + This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. +
immintrin.h
+ Cast +
+ + + + + Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[0] := dst[size - 1] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + // size := 32 or 64 +dst := a +count := shift AND (size - 1) +DO WHILE (count > 0) + tmp[size - 1] := dst[0] + dst := (dst >> 1) OR tmp[size - 1] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[0] := dst[31] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 31 +DO WHILE (count > 0) + tmp[31] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[0] := dst[15] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 15 +DO WHILE (count > 0) + tmp[15] := dst[0] + dst := (dst >> 1) OR tmp + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[0] := dst[63] + dst := (dst << 1) OR tmp[0] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + + Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". + +dst := a +count := shift AND 63 +DO WHILE (count > 0) + tmp[63] := dst[0] + dst := (dst >> 1) OR tmp[63] + count := count - 1 +OD + + +
immintrin.h
+ Shift +
+ + + + Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below: + +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT + +
immintrin.h
+ General Support +
+ + + + Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This function is limited to bitmask values in the first 'page' of the libirc cpu-id information. This intrinsic does not check the processor vendor. See the valid feature flags below: + +_FEATURE_GENERIC_IA32 +_FEATURE_FPU +_FEATURE_CMOV +_FEATURE_MMX +_FEATURE_FXSAVE +_FEATURE_SSE +_FEATURE_SSE2 +_FEATURE_SSE3 +_FEATURE_SSSE3 +_FEATURE_SSE4_1 +_FEATURE_SSE4_2 +_FEATURE_MOVBE +_FEATURE_POPCNT +_FEATURE_PCLMULQDQ +_FEATURE_AES +_FEATURE_F16C +_FEATURE_AVX +_FEATURE_RDRND +_FEATURE_FMA +_FEATURE_BMI +_FEATURE_LZCNT +_FEATURE_HLE +_FEATURE_RTM +_FEATURE_AVX2 +_FEATURE_KNCNI +_FEATURE_AVX512F +_FEATURE_ADX +_FEATURE_RDSEED +_FEATURE_AVX512ER +_FEATURE_AVX512PF +_FEATURE_AVX512CD +_FEATURE_SHA +_FEATURE_MPX +_FEATURE_AVX512BW +_FEATURE_AVX512VL +_FEATURE_AVX512VBMI +_FEATURE_AVX512_4FMAPS +_FEATURE_AVX512_4VNNIW +_FEATURE_AVX512_VPOPCNTDQ +_FEATURE_AVX512_BITALG +_FEATURE_AVX512_VBMI2 +_FEATURE_GFNI +_FEATURE_VAES +_FEATURE_VPCLMULQDQ +_FEATURE_AVX512_VNNI +_FEATURE_CLWB +_FEATURE_RDPID +_FEATURE_IBT +_FEATURE_SHSTK +_FEATURE_SGX +_FEATURE_WBNOINVD +_FEATURE_PCONFIG +_FEATURE_AXV512_4VNNIB +_FEATURE_AXV512_4FMAPH +_FEATURE_AXV512_BITALG2 +_FEATURE_AXV512_VP2INTERSECT +_FEATURE_AXV512_FP16 + +
immintrin.h
+ General Support +
+ + + + + Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This works identically to the previous variant, except it also accepts a 'page' index that permits checking features on the 2nd page of the libirc information. When provided with a '0' in the 'page' parameter, this works identically to _may_i_use_cpu_feature. This intrinsic does not check the processor vendor. See the valid feature flags on the 2nd page below: (provided with a '1' in the 'page' parameter) + +_FEATURE_CLDEMOTE +_FEATURE_MOVDIRI +_FEATURE_MOVDIR64B +_FEATURE_WAITPKG +_FEATURE_AVX512_Bf16 +_FEATURE_ENQCMD +_FEATURE_AVX_VNNI +_FEATURE_AMX_TILE +_FEATURE_AMX_INT8 +_FEATURE_AMX_BF16 +_FEATURE_KL +_FEATURE_WIDE_KL +_FEATURE_HRESET +_FEATURE_UINTR +_FEATURE_PREFETCHI +_FEATURE_AVXVNNIINT8 +_FEATURE_CMPCCXADD +_FEATURE_AVXIFMA +_FEATURE_AVXNECONVERT +_FEATURE_RAOINT +_FEATURE_AMX_FP16 +_FEATURE_AMX_COMPLEX +_FEATURE_SHA512 +_FEATURE_SM3 +_FEATURE_SM4 +_FEATURE_AVXVNNIINT16 +_FEATURE_USERMSR +_FEATURE_AVX10_1_256 +_FEATURE_AVX10_1_512 +_FEATURE_APXF +_FEATURE_MSRLIST +_FEATURE_WRMSRNS +_FEATURE_PBNDKB + +
immintrin.h
+ General Support +
+ + + + Dynamically query the processor to determine if the processor-specific feature(s) specified as a series of compile-time string literals in "feature, ..." are available, and return true or false (1 or 0) if the set of features is available. These feature names are converted to a bitmask and use the same infrastructure as _may_i_use_cpu_feature_ext to validate it. The behavior is the same as the previous variants. This intrinsic does not check the processor vendor. Supported string literals correspond one-to-one to the feature flags listed in the "Operation" sections of _may_i_use_cpu_feature and _may_i_use_cpu_feature_ext. Example string literals are "avx2", "bmi", "avx512fp16", "amx-int8"... + + +
immintrin.h
+ General Support +
+ + + + Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64-bits in "dst". The width of performance counters is implementation specific. + dst[63:0] := ReadPMC(a) + + +
immintrin.h
+ General Support +
+ + + + + + + Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[32:0] := a[31:0] - (b[31:0] + (c_in > 0 ? 1 : 0)) +MEM[out+31:out] := tmp[31:0] +dst[0] := tmp[32] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + + + + Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). + +tmp[64:0] := a[63:0] - (b[63:0] + (c_in > 0 ? 1 : 0)) +MEM[out+63:out] := tmp[63:0] +dst[0] := tmp[64] +dst[7:1] := 0 + + +
immintrin.h
+ Arithmetic +
+ + + + Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave user (non-privilege) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + + Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. + +
immintrin.h
+ Miscellaneous +
+ + + + Write back and flush internal caches. + Initiate writing-back and flushing of external + caches. + +
immintrin.h
+ Miscellaneous +
+ + + + Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst". + +dst[31:0] := Convert_FP16_To_FP32(a[15:0]) + +
emmintrin.h
+ Convert +
+ + + + + Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst". + [round_note] + +dst[15:0] := Convert_FP32_To_FP16(a[31:0]) + +
emmintrin.h
+ Convert +
+ + + + + + + Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". + +IF (imm8[0] == 0) + TEMP1 := a[63:0] +ELSE + TEMP1 := a[127:64] +FI +IF (imm8[4] == 0) + TEMP2 := b[63:0] +ELSE + TEMP2 := b[127:64] +FI +FOR i := 0 to 63 + TEMP[i] := (TEMP1[0] and TEMP2[i]) + FOR j := 1 to i + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +FOR i := 64 to 127 + TEMP[i] := 0 + FOR j := (i - 63) to 63 + TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) + ENDFOR + dst[i] := TEMP[i] +ENDFOR +dst[127] := 0 + + + PCLMULQDQ +
wmmintrin.h
+ Application-Targeted +
+ + + + + + + Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. May return the value in eax, depending on the semantics of the specified leaf function. + + PCONFIG +
immintrin.h
+ Miscellaneous +
+ + + + + + Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 31 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". + +dst := 0 +FOR i := 0 to 63 + IF a[i] + dst := dst + 1 + FI +ENDFOR + + + POPCNT +
immintrin.h
+ Bit Manipulation +
+ + + + + + Loads an instruction sequence containing the specified memory address into all levels of the cache. + + PREFETCHI +
x86gprintrin.h
+ General Support +
+ + + + Loads an instruction sequence containing the specified memory address into all but the first-level cache. + + PREFETCHI +
x86gprintrin.h
+ General Support +
+ + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> + <li>_MM_HINT_ET0 // 7, move data using the ET0 hint. The PREFETCHW instruction will be generated.</li> + <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> + <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> + <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> + <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> + + + + + + + PRFCHW +
immintrin.h
+ General Support +
+ + + + + Atomically add a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically add a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically and a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically and a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically or a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically or a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically xor a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. + + +MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + Atomically xor a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. + + +MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0] + + + + RAO_INT +
x86gprintrin.h
+ Arithmetic +
+ + + + Copy the IA32_TSC_AUX MSR (signature value) into "dst". + dst[31:0] := IA32_TSC_AUX[31:0] + + + RDPID +
immintrin.h
+ General Support +
+ + + + + + Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[15:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[31:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_RND_GEN.ready == 1 + val[63:0] := HW_RND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI + + + RDRAND +
immintrin.h
+ Random +
+ + + + + + Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[15:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[15:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[31:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[31:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
+ + + + Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. + IF HW_NRND_GEN.ready == 1 + val[63:0] := HW_NRND_GEN.data + dst := 1 +ELSE + val[63:0] := 0 + dst := 0 +FI + + + RDSEED +
immintrin.h
+ Random +
+ + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr". + dst[63:0] := TimeStampCounter +MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0] + + + RDTSCP +
immintrin.h
+ General Support +
+ + + + + + Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX. + Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. + IF RTM_ACTIVE == 0 + // nop +ELSE + // restore architectural register state + // discard memory updates performed in transaction + // update EAX with status and imm8 value + eax[31:24] := imm8[7:0] + RTM_NEST_COUNT := 0 + RTM_ACTIVE := 0 + IF _64_BIT_MODE + RIP := fallbackRIP + ELSE + EIP := fallbackEIP + FI +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the start of an RTM code region. + If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. + On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFFFFFF) if continuing inside transaction; all other codes are aborts. + IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT + RTM_NEST_COUNT := RTM_NEST_COUNT + 1 + IF RTM_NEST_COUNT == 1 + IF _64_BIT_MODE + fallbackRIP := RIP + ELSE IF _32_BIT_MODE + fallbackEIP := EIP + FI + + RTM_ACTIVE := 1 + // enter RTM execution, record register state, start tracking memory state + FI +ELSE + // RTM abort (see _xabort) +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Specify the end of an RTM code region. + If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. + If the commit fails, the logical processor will perform an RTM abort. + IF RTM_ACTIVE == 1 + RTM_NEST_COUNT := RTM_NEST_COUNT - 1 + IF RTM_NEST_COUNT == 0 + // try to commit transaction + IF FAIL_TO_COMMIT_TRANSACTION + // RTM abort (see _xabort) + ELSE + RTM_ACTIVE := 0 + FI + FI +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise. + IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1) + dst := 1 +ELSE + dst := 0 +FI + + + RTM +
immintrin.h
+ General Support +
+ + + + + Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched. + + SERIALIZE +
immintrin.h
+ General Support +
+ + + + + + + Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + +W0 := a[127:96] +W1 := a[95:64] +W2 := a[63:32] +W3 := a[31:0] +W4 := b[127:96] +W5 := b[95:64] +dst[127:96] := W2 XOR W0 +dst[95:64] := W3 XOR W1 +dst[63:32] := W4 XOR W2 +dst[31:0] := W5 XOR W3 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst". + +W13 := b[95:64] +W14 := b[63:32] +W15 := b[31:0] +W16 := (a[127:96] XOR W13) <<< 1 +W17 := (a[95:64] XOR W14) <<< 1 +W18 := (a[63:32] XOR W15) <<< 1 +W19 := (a[31:0] XOR W16) <<< 1 +dst[127:96] := W16 +dst[95:64] := W17 +dst[63:32] := W18 +dst[31:0] := W19 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst". + +tmp := (a[127:96] <<< 30) +dst[127:96] := b[127:96] + tmp +dst[95:64] := b[95:64] +dst[63:32] := b[63:32] +dst[31:0] := b[31:0] + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants. + IF (func[1:0] == 0) + f := f0() + K := K0 +ELSE IF (func[1:0] == 1) + f := f1() + K := K1 +ELSE IF (func[1:0] == 2) + f := f2() + K := K2 +ELSE IF (func[1:0] == 3) + f := f3() + K := K3 +FI +A := a[127:96] +B := a[95:64] +C := a[63:32] +D := a[31:0] +W[0] := b[127:96] +W[1] := b[95:64] +W[2] := b[63:32] +W[3] := b[31:0] +A[1] := f(B, C, D) + (A <<< 5) + W[0] + K +B[1] := A +C[1] := B <<< 30 +D[1] := C +E[1] := D +FOR i := 1 to 3 + A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K + B[i+1] := A[i] + C[i+1] := B[i] <<< 30 + D[i+1] := C[i] + E[i+1] := D[i] +ENDFOR +dst[127:96] := A[4] +dst[95:64] := B[4] +dst[63:32] := C[4] +dst[31:0] := D[4] + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + W4 := b[31:0] +W3 := a[127:96] +W2 := a[95:64] +W1 := a[63:32] +W0 := a[31:0] +dst[127:96] := W3 + sigma0(W4) +dst[95:64] := W2 + sigma0(W3) +dst[63:32] := W1 + sigma0(W2) +dst[31:0] := W0 + sigma0(W1) + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". + W14 := b[95:64] +W15 := b[127:96] +W16 := a[31:0] + sigma1(W14) +W17 := a[63:32] + sigma1(W15) +W18 := a[95:64] + sigma1(W16) +W19 := a[127:96] + sigma1(W17) +dst[127:96] := W19 +dst[95:64] := W18 +dst[63:32] := W17 +dst[31:0] := W16 + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + + Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst". + A[0] := b[127:96] +B[0] := b[95:64] +C[0] := a[127:96] +D[0] := a[95:64] +E[0] := b[63:32] +F[0] := b[31:0] +G[0] := a[63:32] +H[0] := a[31:0] +W_K[0] := k[31:0] +W_K[1] := k[63:32] +FOR i := 0 to 1 + A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) + B[i+1] := A[i] + C[i+1] := B[i] + D[i+1] := C[i] + E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] + F[i+1] := E[i] + G[i+1] := F[i] + H[i+1] := G[i] +ENDFOR +dst[127:96] := A[2] +dst[95:64] := B[2] +dst[63:32] := E[2] +dst[31:0] := F[2] + + + SHA +
immintrin.h
+ Cryptography +
+ + + + + This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs an intermediate calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE s0(qword) { + RETURN ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7) +} +W.qword[4] := __B.qword[0] +W.qword[3] := __A.qword[3] +W.qword[2] := __A.qword[2] +W.qword[1] := __A.qword[1] +W.qword[0] := __A.qword[0] +dst.qword[3] := W.qword[3] + s0(W.qword[4]) +dst.qword[2] := W.qword[2] + s0(W.qword[3]) +dst.qword[1] := W.qword[1] + s0(W.qword[2]) +dst.qword[0] := W.qword[0] + s0(W.qword[1]) + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs the final calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE s1(qword) { + RETURN ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6) +} +W.qword[14] := __B.qword[2] +W.qword[15] := __B.qword[3] +W.qword[16] := __A.qword[0] + s1(W.qword[14]) +W.qword[17] := __A.qword[1] + s1(W.qword[15]) +W.qword[18] := __A.qword[2] + s1(W.qword[16]) +W.qword[19] := __A.qword[3] + s1(W.qword[17]) +dst.qword[3] := W.qword[19] +dst.qword[2] := W.qword[18] +dst.qword[1] := W.qword[17] +dst.qword[0] := W.qword[16] + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs two rounds of SHA512 operation using initial SHA512 state (C,D,G,H) from "__A", an initial SHA512 state (A,B,E,F) from "__B", and a pre-computed sum of the next two round message qwords and the corresponding round constants from "__C" (only the two lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is written to "dst", and "dst" can be used as the updated state (C,D,G,H) in later rounds. + + +DEFINE ROR64(qword, n) { + count := n % 64 + dest := (qword >> count) | (qword << (64 - count)) + RETURN dest +} +DEFINE SHR64(qword, n) { + RETURN qword >> n +} +DEFINE cap_sigma0(qword) { + RETURN ROR64(qword, 28) ^ ROR64(qword, 34) ^ ROR64(qword, 39) +} +DEFINE cap_sigma1(qword) { + RETURN ROR64(qword, 14) ^ ROR64(qword, 18) ^ ROR64(qword, 41) +} +DEFINE MAJ(a,b,c) { + RETURN (a & b) ^ (a & c) ^ (b & c) +} +DEFINE CH(a,b,c) { + RETURN (a & b) ^ (c & ~a) +} +A.qword[0] := __B.qword[3] +B.qword[0] := __B.qword[2] +C.qword[0] := __A.qword[3] +D.qword[0] := __A.qword[2] +E.qword[0] := __B.qword[1] +F.qword[0] := __B.qword[0] +G.qword[0] := __A.qword[1] +H.qword[0] := __A.qword[0] +WK.qword[0]:= __C.qword[0] +WK.qword[1]:= __C.qword[1] +FOR i := 0 to 1 + A.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + MAJ(A.qword[i], B.qword[i], C.qword[i]) + cap_sigma0(A.qword[i]) + B.qword[i+1] := A.qword[i] + C.qword[i+1] := B.qword[i] + D.qword[i+1] := C.qword[i] + E.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + D.qword[i] + F.qword[i+1] := E.qword[i] + G.qword[i+1] := F.qword[i] + H.qword[i+1] := G.qword[i] +ENDFOR +dst.qword[3] := A.qword[2] +dst.qword[2] := B.qword[2] +dst.qword[1] := E.qword[2] +dst.qword[0] := F.qword[2] + + + + + SHA512 + AVX +
immintrin.h
+ Cryptography +
+ + + The VSM3MSG1 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs an initial calculation for the next four SM3 message words. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32 - count)) + RETURN dest +} +DEFINE P1(x) { + RETURN x ^ ROL32(x, 15) ^ ROL32(x, 23) +} +W.dword[0] := __C.dword[0] +W.dword[1] := __C.dword[1] +W.dword[2] := __C.dword[2] +W.dword[3] := __C.dword[3] +W.dword[7] := __A.dword[0] +W.dword[8] := __A.dword[1] +W.dword[9] := __A.dword[2] +W.dword[10] := __A.dword[3] +W.dword[13] := __B.dword[0] +W.dword[14] := __B.dword[1] +W.dword[15] := __B.dword[2] +TMP0 := W.dword[7] ^ W.dword[0] ^ ROL32(W.dword[13], 15) +TMP1 := W.dword[8] ^ W.dword[1] ^ ROL32(W.dword[14], 15) +TMP2 := W.dword[9] ^ W.dword[2] ^ ROL32(W.dword[15], 15) +TMP3 := W.dword[10] ^ W.dword[3] +dst.dword[0] := P1(TMP0) +dst.dword[1] := P1(TMP1) +dst.dword[2] := P1(TMP2) +dst.dword[3] := P1(TMP3) + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + The VSM3MSG2 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs the final calculation for the next four SM3 message words. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +WTMP.dword[0] := __A.dword[0] +WTMP.dword[1] := __A.dword[1] +WTMP.dword[2] := __A.dword[2] +WTMP.dword[3] := __A.dword[3] +W.dword[3] := __B.dword[0] +W.dword[4] := __B.dword[1] +W.dword[5] := __B.dword[2] +W.dword[6] := __B.dword[3] +W.dword[10] := __C.dword[0] +W.dword[11] := __C.dword[1] +W.dword[12] := __C.dword[2] +W.dword[13] := __C.dword[3] +W.dword[16] := ROL32(W.dword[3], 7) ^ W.dword[10] ^ WTMP.dword[0] +W.dword[17] := ROL32(W.dword[4], 7) ^ W.dword[11] ^ WTMP.dword[1] +W.dword[18] := ROL32(W.dword[5], 7) ^ W.dword[12] ^ WTMP.dword[2] +W.dword[19] := ROL32(W.dword[6], 7) ^ W.dword[13] ^ WTMP.dword[3] +W.dword[19] := W.dword[19] ^ ROL32(W.dword[16], 6) ^ ROL32(W.dword[16], 15) ^ ROL32(W.dword[16], 30) +dst.dword[0] := W.dword[16] +dst.dword[1] := W.dword[17] +dst.dword[2] := W.dword[18] +dst.dword[3] := W.dword[19] + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + The intrinsic performs two rounds of SM3 operation using initial SM3 state (C, D, G, H) from "__A", an initial SM3 states (A, B, E, F) from "__B" and a pre-computed words from the "__C". "__A" with initial SM3 state of (C, D, G, H) assumes input of non-rotated left variables from previous state. The updated SM3 state (A, B, E, F) is written to "__A". The "imm8" should contain the even round number for the first of the two rounds computed by this instruction. The computation masks the "imm8" value by ANDing it with 0x3E so that only even round numbers from 0 through 62 are used for this operation. The calculated results are stored in "dst". + + +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE P0(x) { + RETURN x ^ ROL32(x, 9) ^ ROL32(x, 17) +} +DEFINE FF(x, y, z, round) { + IF round < 16 + RETURN (x ^ y ^ z) + ELSE + RETURN (x & y) | (x & z) | (y & z) + FI +} +DEFINE GG(x, y, z, round){ + IF round < 16 + RETURN (x ^ y ^ z) + ELSE + RETURN (x & y) | (~x & z) + FI +} +A.dword[0] := __B.dword[3] +B.dword[0] := __B.dword[2] +C.dword[0] := __A.dword[3] +D.dword[0] := __A.dword[2] +E.dword[0] := __B.dword[1] +F.dword[0] := __B.dword[0] +G.dword[0] := __A.dword[1] +H.dword[0] := __A.dword[0] +W.dword[0] := __C.dword[0] +W.dword[1] := __C.dword[1] +W.dword[4] := __C.dword[2] +W.dword[5] := __C.dword[3] +C.dword[0] := ROL32(C.dword[0], 9) +D.dword[0] := ROL32(D.dword[0], 9) +G.dword[0] := ROL32(G.dword[0], 19) +H.dword[0] := ROL32(H.dword[0], 19) +ROUND := imm8 & 0x3E +IF ROUND < 16 + CONST.dword[0] := 0x79CC4519 +ELSE + CONST.dword[0] := 0x7A879D8A +FI +CONST.dword[0] := ROL32(CONST.dword[0], ROUND) +FOR i:= 0 to 1 + temp.dword[0] := ROL32(A.dword[i], 12) + E.dword[i] + CONST.dword[0] + S1.dword[0] := ROL32(temp.dword[0], 7) + S2.dword[0] := S1.dword[0] ^ ROL32(A.dword[i], 12) + T1.dword[0] := FF(A.dword[i], B.dword[i], C.dword[i], ROUND) + D.dword[i] + S2.dword[0] + (W.dword[i] ^ W.dword[i+4]) + 
T2.dword[0] := GG(E.dword[i], F.dword[i], G.dword[i], ROUND) + H.dword[i] + S1.dword[0] + W.dword[i] + D.dword[i+1] := C.dword[i] + C.dword[i+1] := ROL32(B.dword[i], 9) + B.dword[i+1] := A.dword[i] + A.dword[i+1] := T1.dword[0] + H.dword[i+1] := G.dword[i] + G.dword[i+1] := ROL32(F.dword[i], 19) + F.dword[i+1] := E.dword[i] + E.dword[i+1] := P0(T2.dword[0]) + CONST.dword[0] := ROL32(CONST.dword[0], 1) +ENDFOR +dst.dword[3] := A.dword[2] +dst.dword[2] := B.dword[2] +dst.dword[1] := E.dword[2] +dst.dword[0] := F.dword[2] + + + + + + SM3 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := 
SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_KEY(dword) { + RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) +} +DEFINE T_KEY(dword) { + RETURN L_KEY(lower_t(dword)) +} +DEFINE F_KEY(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) +} +FOR i:= 0 to 1 + P.dword[0] := __A.dword[4*i] + P.dword[1] := __A.dword[4*i+1] + P.dword[2] := __A.dword[4*i+2] + P.dword[3] := __A.dword[4*i+3] + C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) + C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) + C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) + C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) + dst.dword[4*i] := C.dword[0] + dst.dword[4*i+1] := C.dword[1] + dst.dword[4*i+2] := C.dword[2] + dst.dword[4*i+3] := C.dword[3] +ENDFOR +dst[MAX:256] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := 
SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_RND(dword) { + tmp := dword + tmp := tmp ^ ROL32(dword, 2) + tmp := tmp ^ ROL32(dword, 10) + tmp := tmp ^ ROL32(dword, 18) + tmp := tmp ^ ROL32(dword, 24) + RETURN tmp +} +DEFINE T_RND(dword) { + RETURN L_RND(lower_t(dword)) +} +DEFINE F_RND(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) +} +FOR i:= 0 to 1 + P.dword[0] := __A.dword[4*i] + P.dword[1] := __A.dword[4*i+1] + P.dword[2] := __A.dword[4*i+2] + P.dword[3] := __A.dword[4*i+3] + C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) + C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) + C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) + C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) + dst.dword[4*i] := C.dword[0] + dst.dword[4*i+1] := C.dword[1] + dst.dword[4*i+2] := C.dword[2] + dst.dword[4*i+3] := C.dword[3] +ENDFOR +dst[MAX:256] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := 
SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_KEY(dword) { + RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) +} +DEFINE T_KEY(dword) { + RETURN L_KEY(lower_t(dword)) +} +DEFINE F_KEY(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) +} +P.dword[0] := __A.dword[0] +P.dword[1] := __A.dword[1] +P.dword[2] := __A.dword[2] +P.dword[3] := __A.dword[3] +C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) +C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) +C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) +C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) +dst.dword[0] := C.dword[0] +dst.dword[1] := C.dword[1] +dst.dword[2] := C.dword[2] +dst.dword[3] := C.dword[3] +dst[MAX:128] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + This intrinsic performs four rounds of SM4 encryption. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". + + +BYTE sbox[256] = { +0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, +0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, +0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, +0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, +0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, +0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, +0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, +0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, +0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, +0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, +0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, +0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, +0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, +0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, +0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, +0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 +} +DEFINE ROL32(dword, n) { + count := n % 32 + dest := (dword << count) | (dword >> (32-count)) + RETURN dest +} +DEFINE SBOX_BYTE(dword, i) { + RETURN sbox[dword.byte[i]] +} +DEFINE lower_t(dword) { + tmp.byte[0] := SBOX_BYTE(dword, 0) + tmp.byte[1] := 
SBOX_BYTE(dword, 1) + tmp.byte[2] := SBOX_BYTE(dword, 2) + tmp.byte[3] := SBOX_BYTE(dword, 3) + RETURN tmp +} +DEFINE L_RND(dword) { + tmp := dword + tmp := tmp ^ ROL32(dword, 2) + tmp := tmp ^ ROL32(dword, 10) + tmp := tmp ^ ROL32(dword, 18) + tmp := tmp ^ ROL32(dword, 24) + RETURN tmp +} +DEFINE T_RND(dword) { + RETURN L_RND(lower_t(dword)) +} +DEFINE F_RND(X0, X1, X2, X3, round_key) { + RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) +} +P.dword[0] := __A.dword[0] +P.dword[1] := __A.dword[1] +P.dword[2] := __A.dword[2] +P.dword[3] := __A.dword[3] +C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) +C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) +C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) +C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) +dst.dword[0] := C.dword[0] +dst.dword[1] := C.dword[1] +dst.dword[2] := C.dword[2] +dst.dword[3] := C.dword[3] +dst[MAX:128] := 0 + + + + SM4 + AVX +
immintrin.h
+ Cryptography +
+ + + + Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ACOS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ACOS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ACOSH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ACOSH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASIN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ASIN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ASINH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ASINH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ATANH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ATANH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COSD(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COSD(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := COSH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := COSH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIN(a[i+63:i]) + MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + + Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIN(a[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SIND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SIND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SINH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SINH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TAN(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TAN(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TAND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TAND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TANH(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TANH(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Trigonometry +
+ + + + Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CEXP(a[31:0], b[31:0]) { + result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) + result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CLOG(a[31:0], b[31:0]) { + result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) + result[63:32] := ATAN2(b, a) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". + +DEFINE CSQRT(a[31:0], b[31:0]) { + sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) + result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) + RETURN result +} +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(10.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(2.0, a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InvCubeRoot(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InvCubeRoot(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InvSQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InvSQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(1.0 + a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(1.0 + a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ConvertExpFP64(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ConvertExpFP32(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + + Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Elementary Math Functions +
+ + + + Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := InverseCDFNormal(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := InverseCDFNormal(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 - ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 - ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := 1.0 / ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := 1.0 / ERF(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Probability/Statistics +
+ + + + + Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 1 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 15 + i := 8*j + IF b[i+7:i] == 0 + #DE + FI + dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 7 + i := 16*j + IF b[i+15:i] == 0 + #DE + FI + dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 3 + i := 32*j + IF b[i+31:i] == 0 + #DE + FI + dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + +FOR j := 0 to 1 + i := 64*j + IF b[i+63:i] == 0 + #DE + FI + dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ERF(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". + FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". + FOR j := 0 to 15 + i := 8*j + dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". + FOR j := 0 to 7 + i := 16*j + dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". + FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) + MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + + Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". + FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Arithmetic +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Special Math Functions +
+ + + + Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := TRUNCATE(a[i+63:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Miscellaneous +
+ + + + Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := TRUNCATE(a[i+31:i]) +ENDFOR +dst[MAX:128] := 0 + + SSE +
immintrin.h
+ Miscellaneous +
+ + + + + + + + + Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.). + +__m128 tmp3, tmp2, tmp1, tmp0; +tmp0 := _mm_unpacklo_ps(row0, row1); +tmp2 := _mm_unpacklo_ps(row2, row3); +tmp1 := _mm_unpackhi_ps(row0, row1); +tmp3 := _mm_unpackhi_ps(row2, row3); +row0 := _mm_movelh_ps(tmp0, tmp2); +row1 := _mm_movehl_ps(tmp2, tmp0); +row2 := _mm_movelh_ps(tmp1, tmp3); +row3 := _mm_movehl_ps(tmp3, tmp1); + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[63:0] := a[63:0] +sel := imm8[1:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[15:0] := src[15:0] + 1: tmp[15:0] := src[31:16] + 2: tmp[15:0] := src[47:32] + 3: tmp[15:0] := src[63:48] + ESAC + RETURN tmp[15:0] +} +dst[15:0] := SELECT4(a[63:0], imm8[1:0]) +dst[31:16] := SELECT4(a[63:0], imm8[3:2]) +dst[47:32] := SELECT4(a[63:0], imm8[5:4]) +dst[63:48] := SELECT4(a[63:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + + Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(b[127:0], imm8[5:4]) +dst[127:96] := SELECT4(b[127:0], imm8[7:6]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) + + + SSE +
xmmintrin.h
+ Swizzle +
+ + + + Get the unsigned 32-bit value of the MXCSR control and status register. + dst[31:0] := MXCSR + + + SSE +
immintrin.h
+ General Support +
+ + + + Set the MXCSR control and status register with the value in unsigned 32-bit integer "a". + +MXCSR := a[31:0] + + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + dst[31:0] := MXCSR & _MM_EXCEPT_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT + MXCSR := (MXCSR AND ~_MM_EXCEPT_MASK) OR a[31:0] + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + dst[31:0] := MXCSR & _MM_MASK_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT + MXCSR := (MXCSR AND ~_MM_MASK_MASK) OR a[31:0] + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + dst[31:0] := MXCSR & _MM_ROUND_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + MXCSR := (MXCSR AND ~_MM_ROUND_MASK) OR a[31:0] + + SSE +
immintrin.h
+ General Support +
+ + + Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + dst[31:0] := MXCSR & _MM_FLUSH_MASK + + SSE +
immintrin.h
+ General Support +
+ + + + Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF + MXCSR := (MXCSR AND ~_MM_FLUSH_MASK) OR a[31:0] + + SSE +
immintrin.h
+ General Support +
+ + + + + Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> + <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> + <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> + <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> + <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> + + + + + + SSE +
immintrin.h
+ General Support +
+ + + + Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. + + SSE +
immintrin.h
+ General Support +
+ + + + + Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Free aligned memory that was allocated with "_mm_malloc". + SSE +
immintrin.h
+ General Support +
+ + + + Return vector of type __m128 with undefined elements. + SSE +
immintrin.h
+ General Support +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [min_float_note] + +dst[31:0] := MIN(a[31:0], b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [max_float_note] + +dst[31:0] := MAX(a[31:0], b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Special Math Functions +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". + +FOR j := 0 to 7 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] +dst[63:16] := 0 + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] + b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] - b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] * b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] * b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := a[31:0] / b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := a[i+31:i] / b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE +
xmmintrin.h
+ Probability/Statistics +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(b[31:0]) +dst[63:32] := Convert_Int32_To_FP32(b[63:32]) +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*16 + m := j*32 + dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := j*8 + m := j*32 + dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed signed 32-bit integers in "b" to single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst". + +dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +dst[127:96] := Convert_Int32_To_FP32(b[63:32]) + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". + +dst[31:0] := a[31:0] + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 16*j + k := 32*j + IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+15:i] := 0x7FFF + ELSE + dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) + FI +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. + +FOR j := 0 to 3 + i := 8*j + k := 32*j + IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) + dst[i+7:i] := 0x7F + ELSE + dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) + FI +ENDFOR + + SSE +
xmmintrin.h
+ Convert +
+ + + + + Store 64-bits of integer data from "a" into memory using a non-temporal memory hint. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. + +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE +
immintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). + +FOR j := 0 to 7 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[95:64] +MEM[mem_addr+63:mem_addr+32] := a[127:96] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[63:32] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + SSE +
immintrin.h
+ Store +
+ + + + + Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[31:0] +MEM[mem_addr+63:mem_addr+32] := a[31:0] +MEM[mem_addr+95:mem_addr+64] := a[31:0] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE +
immintrin.h
+ Store +
+ + + + + Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+31:mem_addr] := a[127:96] +MEM[mem_addr+63:mem_addr+32] := a[95:64] +MEM[mem_addr+95:mem_addr+64] := a[63:32] +MEM[mem_addr+127:mem_addr+96] := a[31:0] + + + SSE +
immintrin.h
+ Store +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:8] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:8] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". + +FOR j := 0 to 3 + i := j*32 + IF a[i+31] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:4] := 0 + + + SSE +
xmmintrin.h
+ Miscellaneous +
+ + + + Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := SQRT(a[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SQRT(a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst[31:0] := (1.0 / a[31:0]) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / a[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +dst[31:0] := (1.0 / SQRT(a[31:0])) +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) +ENDFOR + + + SSE +
xmmintrin.h
+ Elementary Math Functions +
+ + + + + Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] OR b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] XOR b[i+31:i] +ENDFOR + + + SSE +
xmmintrin.h
+ Logical +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] < b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] <= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] <= b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] > b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] > b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := (!( a[31:0] >= b[31:0] )) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := (!( a[i+31:i] >= b[i+31:i] )) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0 +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". + FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + + Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 + + + SSE +
xmmintrin.h
+ Compare +
+ + + + Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements. + +dst[31:0] := a[31:0] +dst[127:32] := 0 + + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE +
xmmintrin.h
+ Set +
+ + + + Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 + + SSE +
xmmintrin.h
+ Set +
+ + + + + + + Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 + + SSE +
xmmintrin.h
+ Set +
+ + + + Return vector of type __m128 with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE +
xmmintrin.h
+ Set +
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+63:mem_addr+32] + + + SSE +
immintrin.h
+ Load +
+ + + + + Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+63:mem_addr+32] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[127:32] := 0 + + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[63:32] := MEM[mem_addr+31:mem_addr] +dst[95:64] := MEM[mem_addr+31:mem_addr] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE +
immintrin.h
+ Load +
+ + + + Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE +
immintrin.h
+ Load +
+ + + + Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[31:0] := MEM[mem_addr+127:mem_addr+96] +dst[63:32] := MEM[mem_addr+95:mem_addr+64] +dst[95:64] := MEM[mem_addr+63:mem_addr+32] +dst[127:96] := MEM[mem_addr+31:mem_addr] + + SSE +
immintrin.h
+ Load +
+ + + + + Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := b[31:0] +dst[127:32] := a[127:32] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". + +dst[31:0] := b[95:64] +dst[63:32] := b[127:96] +dst[95:64] := a[95:64] +dst[127:96] := a[127:96] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[63:32] +dst[95:64] := b[31:0] +dst[127:96] := b[63:32] + + + SSE +
xmmintrin.h
+ Move +
+ + + + + + Return vector of type __m128d with undefined elements. + SSE2 +
emmintrin.h
+ General Support +
+ + + + Return vector of type __m128i with undefined elements. + SSE2 +
emmintrin.h
+ General Support +
+ + + + Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. + + SSE2 +
emmintrin.h
+ General Support +
+ + + + Load unaligned 64-bit integer from memory into the first element of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 + + + SSE2 +
immintrin.h
+ Load +
+ + + + Load unaligned 16-bit integer from memory into the first element of "dst". + +dst[15:0] := MEM[mem_addr+15:mem_addr] +dst[MAX:16] := 0 + + SSE2 +
immintrin.h
+ Load +
+ + + + Load unaligned 32-bit integer from memory into the first element of "dst". + +dst[31:0] := MEM[mem_addr+31:mem_addr] +dst[MAX:32] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 64-bit integer from memory into the first element of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[MAX:64] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits of integer data from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[63:0] := MEM[mem_addr+127:mem_addr+64] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". + "mem_addr" does not need to be aligned on any particular boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := a[63:0] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + + Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Load +
+ + + + + Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+15:mem_addr] := a[15:0] + + SSE2 +
immintrin.h
+ Store +
+ + + + + Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
immintrin.h
+ Store +
+ + + + + Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + + Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary. + +FOR j := 0 to 15 + i := j*8 + IF mask[i+7] + MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits of integer data from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits of integer data from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 64-bit integer from the first element of "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. + +MEM[mem_addr+31:mem_addr] := a[31:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[63:0] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. + "mem_addr" does not need to be aligned on any particular boundary. + +MEM[mem_addr+127:mem_addr] := a[127:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +MEM[mem_addr+63:mem_addr] := a[127:64] +MEM[mem_addr+127:mem_addr+64] := a[63:0] + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the upper double-precision (64-bit) floating-point element from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[127:64] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Store the lower double-precision (64-bit) floating-point element from "a" into memory. + +MEM[mem_addr+63:mem_addr] := a[63:0] + + + SSE2 +
emmintrin.h
+ Store +
+ + + + + Add packed 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] + b[i+7:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] + b[i+15:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 32-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] + b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add 64-bit integers "a" and "b", and store the result in "dst". + +dst[63:0] := a[63:0] + b[63:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed 64-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := a[i+15:i] * b[i+15:i] + dst[i+15:i] := tmp[31:16] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) + dst[i+15:i] := tmp[15:0] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". + +dst[63:0] := a[31:0] * b[31:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+31:i] * b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". + +FOR j := 0 to 15 + i := j*8 + tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) +ENDFOR +FOR j := 0 to 1 + i := j*64 + dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] + dst[i+63:i+16] := 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[i+7:i] - b[i+7:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[i+15:i] - b[i+15:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[i+31:i] - b[i+31:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst". + +dst[63:0] := a[63:0] - b[63:0] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] + b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] + b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] / b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + dst[i+63:i] := a[i+63:i] / b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] * b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] * b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := a[63:0] - b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] - b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Arithmetic +
+ + + + + Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 +ENDFOR + + + SSE2 +
emmintrin.h
+ Probability/Statistics +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [max_float_note] + +dst[63:0] := MAX(a[63:0], b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [min_float_note] + +dst[63:0] := MIN(a[63:0], b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Special Math Functions +
+ + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] << (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) + ELSE + dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) + ELSE + dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". + +tmp := imm8[7:0] +IF tmp > 15 + tmp := 16 +FI +dst[127:0] := a[127:0] >> (tmp*8) + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[7:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF count[63:0] > 15 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[7:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF count[63:0] > 31 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[7:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF count[63:0] > 63 + dst[i+63:i] := 0 + ELSE + dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) + FI +ENDFOR + + + SSE2 +
emmintrin.h
+ Shift +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". + +dst[127:0] := ((NOT a[127:0]) AND b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] OR b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". + +dst[127:0] := (a[127:0] XOR b[127:0]) + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] OR b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +ENDFOR + + + SSE2 +
emmintrin.h
+ Logical +
+ + + + + Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] < b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] <= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] > b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := (!(a[63:0] >= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". + FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] < b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] <= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] > b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := (!(a[i+63:i] >= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + + Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. + RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 + + + SSE2 +
emmintrin.h
+ Compare +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_Int64_To_FP64(b[63:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := j*32 + m := j*64 + dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". + +dst[31:0] := a[31:0] +dst[127:32] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 32-bit integer in "a" to "dst". + +dst[31:0] := a[31:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) +ENDFOR +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := Convert_FP64_To_FP32(b[63:0]) +dst[127:32] := a[127:32] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := Convert_FP32_To_FP64(b[31:0]) +dst[127:64] := a[127:64] +dst[MAX:128] := 0 + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". + +dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". + +dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". + +FOR j := 0 to 1 + i := 32*j + k := 64*j + dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Convert +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values. + +dst[31:0] := e0 +dst[63:32] := e1 +dst[95:64] := e2 +dst[127:96] := e3 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values. + +dst[15:0] := e0 +dst[31:16] := e1 +dst[47:32] := e2 +dst[63:48] := e3 +dst[79:64] := e4 +dst[95:80] := e5 +dst[111:96] := e6 +dst[127:112] := e7 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values. + +dst[7:0] := e0 +dst[15:8] := e1 +dst[23:16] := e2 +dst[31:24] := e3 +dst[39:32] := e4 +dst[47:40] := e5 +dst[55:48] := e6 +dst[63:56] := e7 +dst[71:64] := e8 +dst[79:72] := e9 +dst[87:80] := e10 +dst[95:88] := e11 +dst[103:96] := e12 +dst[111:104] := e13 +dst[119:112] := e14 +dst[127:120] := e15 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := a[31:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 16-bit integer "a" to all all elements of "dst". This intrinsic may generate "vpbroadcastw". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := a[15:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := a[7:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed 64-bit integers in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + Set packed 32-bit integers in "dst" with the supplied values in reverse order. + +dst[31:0] := e3 +dst[63:32] := e2 +dst[95:64] := e1 +dst[127:96] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + Set packed 16-bit integers in "dst" with the supplied values in reverse order. + +dst[15:0] := e7 +dst[31:16] := e6 +dst[47:32] := e5 +dst[63:48] := e4 +dst[79:64] := e3 +dst[95:80] := e2 +dst[111:96] := e1 +dst[127:112] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + + + + + + + + + + + + + + + Set packed 8-bit integers in "dst" with the supplied values in reverse order. + +dst[7:0] := e15 +dst[15:8] := e14 +dst[23:16] := e13 +dst[31:24] := e12 +dst[39:32] := e11 +dst[47:40] := e10 +dst[55:48] := e9 +dst[63:56] := e8 +dst[71:64] := e7 +dst[79:72] := e6 +dst[87:80] := e5 +dst[95:88] := e4 +dst[103:96] := e3 +dst[111:104] := e2 +dst[119:112] := e1 +dst[127:120] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + Return vector of type __m128i with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
+ + + + Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := a[63:0] +ENDFOR + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. + +dst[63:0] := e0 +dst[127:64] := e1 + + SSE2 +
emmintrin.h
+ Set +
+ + + + + Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. + +dst[63:0] := e1 +dst[127:64] := e0 + + SSE2 +
emmintrin.h
+ Set +
+ + + + Return vector of type __m128d with all elements set to zero. + +dst[MAX:0] := 0 + + + SSE2 +
emmintrin.h
+ Set +
+ + + + Copy the lower 64-bit integer in "a" to "dst". + +dst[63:0] := a[63:0] + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". + +dst[7:0] := Saturate8(a[15:0]) +dst[15:8] := Saturate8(a[31:16]) +dst[23:16] := Saturate8(a[47:32]) +dst[31:24] := Saturate8(a[63:48]) +dst[39:32] := Saturate8(a[79:64]) +dst[47:40] := Saturate8(a[95:80]) +dst[55:48] := Saturate8(a[111:96]) +dst[63:56] := Saturate8(a[127:112]) +dst[71:64] := Saturate8(b[15:0]) +dst[79:72] := Saturate8(b[31:16]) +dst[87:80] := Saturate8(b[47:32]) +dst[95:88] := Saturate8(b[63:48]) +dst[103:96] := Saturate8(b[79:64]) +dst[111:104] := Saturate8(b[95:80]) +dst[119:112] := Saturate8(b[111:96]) +dst[127:120] := Saturate8(b[127:112]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". + +dst[15:0] := Saturate16(a[31:0]) +dst[31:16] := Saturate16(a[63:32]) +dst[47:32] := Saturate16(a[95:64]) +dst[63:48] := Saturate16(a[127:96]) +dst[79:64] := Saturate16(b[31:0]) +dst[95:80] := Saturate16(b[63:32]) +dst[111:96] := Saturate16(b[95:64]) +dst[127:112] := Saturate16(b[127:96]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + + Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". + +dst[7:0] := SaturateU8(a[15:0]) +dst[15:8] := SaturateU8(a[31:16]) +dst[23:16] := SaturateU8(a[47:32]) +dst[31:24] := SaturateU8(a[63:48]) +dst[39:32] := SaturateU8(a[79:64]) +dst[47:40] := SaturateU8(a[95:80]) +dst[55:48] := SaturateU8(a[111:96]) +dst[63:56] := SaturateU8(a[127:112]) +dst[71:64] := SaturateU8(b[15:0]) +dst[79:72] := SaturateU8(b[31:16]) +dst[87:80] := SaturateU8(b[47:32]) +dst[95:88] := SaturateU8(b[63:48]) +dst[103:96] := SaturateU8(b[79:64]) +dst[111:104] := SaturateU8(b[95:80]) +dst[119:112] := SaturateU8(b[111:96]) +dst[127:120] := SaturateU8(b[127:112]) + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[j] := a[i+7] +ENDFOR +dst[MAX:16] := 0 + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". + +FOR j := 0 to 1 + i := j*64 + IF a[i+63] + dst[j] := 1 + ELSE + dst[j] := 0 + FI +ENDFOR +dst[MAX:2] := 0 + + + SSE2 +
emmintrin.h
+ Miscellaneous +
+ + + + Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Move +
+ + + + Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. + +dst[63:0] := a[63:0] +dst[127:64] := 0 + + + SSE2 +
emmintrin.h
+ Move +
+ + + + + Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := b[63:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Move +
+ + + + + Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] +dst[31:16] := 0 + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[2:0]*16 +dst[sel+15:sel] := i[15:0] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". + +DEFINE SELECT4(src, control) { + CASE(control[1:0]) OF + 0: tmp[31:0] := src[31:0] + 1: tmp[31:0] := src[63:32] + 2: tmp[31:0] := src[95:64] + 3: tmp[31:0] := src[127:96] + ESAC + RETURN tmp[31:0] +} +dst[31:0] := SELECT4(a[127:0], imm8[1:0]) +dst[63:32] := SELECT4(a[127:0], imm8[3:2]) +dst[95:64] := SELECT4(a[127:0], imm8[5:4]) +dst[127:96] := SELECT4(a[127:0], imm8[7:6]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst". + +dst[63:0] := a[63:0] +dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] +dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] +dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] +dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst". + +dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] +dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] +dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] +dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[71:64] + dst[15:8] := src2[71:64] + dst[23:16] := src1[79:72] + dst[31:24] := src2[79:72] + dst[39:32] := src1[87:80] + dst[47:40] := src2[87:80] + dst[55:48] := src1[95:88] + dst[63:56] := src2[95:88] + dst[71:64] := src1[103:96] + dst[79:72] := src2[103:96] + dst[87:80] := src1[111:104] + dst[95:88] := src2[111:104] + dst[103:96] := src1[119:112] + dst[111:104] := src2[119:112] + dst[119:112] := src1[127:120] + dst[127:120] := src2[127:120] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[79:64] + dst[31:16] := src2[79:64] + dst[47:32] := src1[95:80] + dst[63:48] := src2[95:80] + dst[79:64] := src1[111:96] + dst[95:80] := src2[111:96] + dst[111:96] := src1[127:112] + dst[127:112] := src2[127:112] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[95:64] + dst[63:32] := src2[95:64] + dst[95:64] := src1[127:96] + dst[127:96] := src2[127:96] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { + dst[7:0] := src1[7:0] + dst[15:8] := src2[7:0] + dst[23:16] := src1[15:8] + dst[31:24] := src2[15:8] + dst[39:32] := src1[23:16] + dst[47:40] := src2[23:16] + dst[55:48] := src1[31:24] + dst[63:56] := src2[31:24] + dst[71:64] := src1[39:32] + dst[79:72] := src2[39:32] + dst[87:80] := src1[47:40] + dst[95:88] := src2[47:40] + dst[103:96] := src1[55:48] + dst[111:104] := src2[55:48] + dst[119:112] := src1[63:56] + dst[127:120] := src2[63:56] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { + dst[15:0] := src1[15:0] + dst[31:16] := src2[15:0] + dst[47:32] := src1[31:16] + dst[63:48] := src2[31:16] + dst[79:64] := src1[47:32] + dst[95:80] := src2[47:32] + dst[111:96] := src1[63:48] + dst[127:112] := src2[63:48] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { + dst[31:0] := src1[31:0] + dst[63:32] := src2[31:0] + dst[95:64] := src1[63:32] + dst[127:96] := src2[63:32] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[127:64] + dst[127:64] := src2[127:64] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". + +DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { + dst[63:0] := src1[63:0] + dst[127:64] := src2[63:0] + RETURN dst[127:0] +} +dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + + Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". + +dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] + + + SSE2 +
emmintrin.h
+ Swizzle +
+ + + + + Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := SQRT(b[63:0]) +dst[127:64] := a[127:64] + + + SSE2 +
emmintrin.h
+ Elementary Math Functions +
+ + + + Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SQRT(a[i+63:i]) +ENDFOR + + + SSE2 +
emmintrin.h
+ Elementary Math Functions +
+ + + + Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. + SSE2 +
emmintrin.h
+ Cast +
+ + + + + + + Alternately add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF ((j & 1) == 0) + dst[i+31:i] := a[i+31:i] - b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + b[i+31:i] + FI +ENDFOR + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Alternately add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF ((j & 1) == 0) + dst[i+63:i] := a[i+63:i] - b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + b[i+63:i] + FI +ENDFOR + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[127:64] + a[63:0] +dst[127:64] := b[127:64] + b[63:0] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[63:0] := a[63:0] - a[127:64] +dst[127:64] := b[63:0] - b[127:64] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] + + + SSE3 +
pmmintrin.h
+ Arithmetic +
+ + + + Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE3 +
pmmintrin.h
+ Load +
+ + + + Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". + +dst[63:0] := MEM[mem_addr+63:mem_addr] +dst[127:64] := MEM[mem_addr+63:mem_addr] + + + SSE3 +
pmmintrin.h
+ Load +
+ + + + Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". + +dst[63:0] := a[63:0] +dst[127:64] := a[63:0] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[63:32] +dst[63:32] := a[63:32] +dst[95:64] := a[127:96] +dst[127:96] := a[127:96] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". + +dst[31:0] := a[31:0] +dst[63:32] := a[31:0] +dst[95:64] := a[95:64] +dst[127:96] := a[95:64] + + + SSE3 +
pmmintrin.h
+ Move +
+ + + + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF imm8[j] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF imm8[j] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + IF mask[i+63] + dst[i+63:i] := b[i+63:i] + ELSE + dst[i+63:i] := a[i+63:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 3 + i := j*32 + IF mask[i+31] + dst[i+31:i] := b[i+31:i] + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF mask[i+7] + dst[i+7:i] := b[i+7:i] + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". + +FOR j := 0 to 7 + i := j*16 + IF imm8[j] + dst[i+15:i] := b[i+15:i] + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". + +dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] +dst[31:8] := 0 + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". + +dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). + +tmp2[127:0] := a[127:0] +CASE (imm8[7:6]) OF +0: tmp1[31:0] := b[31:0] +1: tmp1[31:0] := b[63:32] +2: tmp1[31:0] := b[95:64] +3: tmp1[31:0] := b[127:96] +ESAC +CASE (imm8[5:4]) OF +0: tmp2[31:0] := tmp1[31:0] +1: tmp2[63:32] := tmp1[31:0] +2: tmp2[95:64] := tmp1[31:0] +3: tmp2[127:96] := tmp1[31:0] +ESAC +FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := tmp2[i+31:i] + FI +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[3:0]*8 +dst[sel+7:sel] := i[7:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[1:0]*32 +dst[sel+31:sel] := i[31:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". + +dst[127:0] := a[127:0] +sel := imm8[0]*64 +dst[sel+63:sel] := i[63:0] + + + SSE4.1 +
smmintrin.h
+ Swizzle +
+ + + + + + Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the two products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 1 + i := j*64 + IF imm8[(4+j)%8] + temp[i+63:i] := a[i+63:i] * b[i+63:i] + ELSE + temp[i+63:i] := 0.0 + FI + ENDFOR + + sum[63:0] := temp[127:64] + temp[63:0] + + FOR j := 0 to 1 + i := j*64 + IF imm8[j%8] + tmpdst[i+63:i] := sum[63:0] + ELSE + tmpdst[i+63:i] := 0.0 + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + + Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". + +DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { + FOR j := 0 to 3 + i := j*32 + IF imm8[(4+j)%8] + temp[i+31:i] := a[i+31:i] * b[i+31:i] + ELSE + temp[i+31:i] := 0 + FI + ENDFOR + + sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) + + FOR j := 0 to 3 + i := j*32 + IF imm8[j%8] + tmpdst[i+31:i] := sum[31:0] + ELSE + tmpdst[i+31:i] := 0 + FI + ENDFOR + RETURN tmpdst[127:0] +} +dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". + +FOR j := 0 to 3 + i := j*32 + tmp[63:0] := a[i+31:i] * b[i+31:i] + dst[i+31:i] := tmp[31:0] +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + Miscellaneous + + + + + Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". + Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". + +DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { + a_offset := imm8[2]*32 + b_offset := imm8[1:0]*32 + FOR j := 0 to 7 + i := j*8 + k := a_offset+i + l := b_offset + tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ + ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) + ENDFOR + RETURN tmp[127:0] +} +dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) + + + SSE4.1 +
smmintrin.h
+ Arithmetic +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ROUND(a[i+63:i], rounding) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := FLOOR(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := CEIL(a[i+63:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". + [round_note] + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ROUND(a[i+31:i], rounding) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := FLOOR(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := CEIL(a[i+31:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + + Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + [round_note] + +dst[63:0] := ROUND(b[63:0], rounding) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := FLOOR(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". + +dst[63:0] := CEIL(b[63:0]) +dst[127:64] := a[127:64] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + + Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + [round_note] + +dst[31:0] := ROUND(b[31:0], rounding) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := FLOOR(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + + + + Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". + +dst[31:0] := CEIL(b[31:0]) +dst[127:32] := a[127:32] + + + SSE4.1 +
smmintrin.h
+ Special Math Functions +
+ + Miscellaneous + + + + Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". + +dst[15:0] := SaturateU16(a[31:0]) +dst[31:16] := SaturateU16(a[63:32]) +dst[47:32] := SaturateU16(a[95:64]) +dst[63:48] := SaturateU16(a[127:96]) +dst[79:64] := SaturateU16(b[31:0]) +dst[95:80] := SaturateU16(b[63:32]) +dst[111:96] := SaturateU16(b[95:64]) +dst[127:112] := SaturateU16(b[127:96]) + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + l := j*16 + dst[l+15:l] := SignExtend16(a[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[i+31:i] := SignExtend32(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[i+63:i] := SignExtend64(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[i+31:i] := SignExtend32(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[i+63:i] := SignExtend64(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := SignExtend64(a[k+31:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + l := j*16 + dst[l+15:l] := ZeroExtend16(a[i+7:i]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 8*j + dst[i+31:i] := ZeroExtend32(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 8*j + dst[i+63:i] := ZeroExtend64(a[k+7:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". + +FOR j := 0 to 3 + i := 32*j + k := 16*j + dst[i+31:i] := ZeroExtend32(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 16*j + dst[i+63:i] := ZeroExtend64(a[k+15:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". + +FOR j := 0 to 1 + i := 64*j + k := 32*j + dst[i+63:i] := ZeroExtend64(a[k+31:k]) +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Convert +
+ + + + + Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE4.1 +
smmintrin.h
+ Compare +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN ZF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +RETURN CF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[127:0] AND b[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND b[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0. + +IF ((a[127:0] AND mask[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +dst := ZF + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + + Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. + +IF ((a[127:0] AND mask[127:0]) == 0) + ZF := 1 +ELSE + ZF := 0 +FI +IF (((NOT a[127:0]) AND mask[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +IF (ZF == 0 && CF == 0) + dst := 1 +ELSE + dst := 0 +FI + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. + +FOR j := 0 to 127 + tmp[j] := 1 +ENDFOR +IF (((NOT a[127:0]) AND tmp[127:0]) == 0) + CF := 1 +ELSE + CF := 0 +FI +dst := CF + + + + SSE4.1 +
smmintrin.h
+ Logical +
+ + + + Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst". + +index[2:0] := 0 +min[15:0] := a[15:0] +FOR j := 0 to 7 + i := j*16 + IF a[i+15:i] < min[15:0] + index[2:0] := j + min[15:0] := a[i+15:i] + FI +ENDFOR +dst[15:0] := min[15:0] +dst[18:16] := index[2:0] +dst[127:19] := 0 + + + SSE4.1 +
smmintrin.h
+ Miscellaneous +
+ + + + Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. + "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. + +dst[127:0] := MEM[mem_addr+127:mem_addr] + + + SSE4.1 +
smmintrin.h
+ Load +
+ + + + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] 
:= BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // byte / word mask + FOR i := 0 to UpperBound + j := i*size + IF IntRes2[i] + dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) + ELSE + dst[j+size-1:j] := 0 + FI + ENDFOR +ELSE // bit mask + dst[UpperBound:0] := IntRes2[UpperBound:0] + dst[127:UpperBound+1] := 0 +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := 
BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // most significant bit + tmp := UpperBound + dst := tmp + DO WHILE ((tmp >= 0) AND a[tmp] == 0) + tmp := tmp - 1 + dst := tmp + OD +ELSE // least significant bit + tmp := 0 + dst := tmp + DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) + tmp := tmp + 1 + dst := tmp + OD +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +bInvalid := 0 +FOR j := 0 to UpperBound + n := j*size + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI +ENDFOR +dst := bInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to 
UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 != 0) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +aInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI +ENDFOR +dst := aInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + 
IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := IntRes2[0] + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF a[m+size-1:m] == 0 + aInvalid := 1 + FI + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // 
equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +bInvalid := 0 +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF b[n+size-1:n] == 0 + bInvalid := 1 + FI + IF bInvalid // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 == 0) AND bInvalid + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := 
BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // byte / word mask + FOR i := 0 to UpperBound + j := i*size + IF IntRes2[i] + dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) + ELSE + dst[j+size-1:j] := 0 + FI + ENDFOR +ELSE // bit mask + dst[UpperBound:0] := IntRes2[UpperBound:0] + dst[127:UpperBound+1] := 0 +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst". + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := 
BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +IF imm8[6] // most significant bit + tmp := UpperBound + dst := tmp + DO WHILE ((tmp >= 0) AND a[tmp] == 0) + tmp := tmp - 1 + dst := tmp + OD +ELSE // least significant bit + tmp := 0 + dst := tmp + DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) + tmp := tmp + 1 + dst := tmp + OD +FI + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +dst := (lb <= UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + 
IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 != 0) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +dst := (la <= UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := 
BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := IntRes2[0] + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + + + + Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. + [strcmp_note] + +size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters +UpperBound := (128 / size) - 1 +BoolRes := 0 +// compare all characters +aInvalid := 0 +bInvalid := 0 +FOR i := 0 to UpperBound + m := i*size + FOR j := 0 to UpperBound + n := j*size + BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 + + // invalidate characters after EOS + IF i == la + aInvalid := 1 + FI + IF j == lb + bInvalid := 1 + FI + + // override comparisons for invalid characters + CASE (imm8[3:2]) OF + 0: // equal any + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 1: // ranges + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + FI + 2: // equal each + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + 3: // equal ordered + IF (!aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 0 + ELSE IF (aInvalid && !bInvalid) + BoolRes.word[i].bit[j] := 1 + ELSE IF (aInvalid && bInvalid) + BoolRes.word[i].bit[j] := 1 + FI + ESAC + ENDFOR +ENDFOR +// aggregate results +CASE (imm8[3:2]) OF +0: // equal any + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] + ENDFOR + ENDFOR +1: // ranges + IntRes1 := 0 + FOR i := 0 to UpperBound + FOR j := 0 to UpperBound + IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) + j += 2 + ENDFOR + ENDFOR +2: // equal each + 
IntRes1 := 0 + FOR i := 0 to UpperBound + IntRes1[i] := BoolRes.word[i].bit[i] + ENDFOR +3: // equal ordered + IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) + FOR i := 0 to UpperBound + k := i + FOR j := 0 to UpperBound-i + IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] + k := k+1 + ENDFOR + ENDFOR +ESAC +// optionally negate results +FOR i := 0 to UpperBound + IF imm8[4] + IF imm8[5] // only negate valid + IF i >= lb // invalid, don't negate + IntRes2[i] := IntRes1[i] + ELSE // valid, negate + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // negate all + IntRes2[i] := -1 XOR IntRes1[i] + FI + ELSE // don't negate + IntRes2[i] := IntRes1[i] + FI +ENDFOR +// output +dst := (IntRes2 == 0) AND (lb > UpperBound) + + + SSE4.2 +
nmmintrin.h
+ String Compare +
+ + + + + Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". + +FOR j := 0 to 1 + i := j*64 + dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 +ENDFOR + + + SSE4.2 +
nmmintrin.h
+ Compare +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". + tmp1[7:0] := v[0:7] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[39:0] := tmp1[7:0] << 32 +tmp4[39:0] := tmp2[31:0] << 8 +tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] +tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". + tmp1[15:0] := v[0:15] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[47:0] := tmp1[15:0] << 32 +tmp4[47:0] := tmp2[31:0] << 16 +tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] +tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". + tmp1[31:0] := v[0:31] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[63:0] := tmp1[31:0] << 32 +tmp4[63:0] := tmp2[31:0] << 32 +tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] +tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". + tmp1[63:0] := v[0:63] // bit reflection +tmp2[31:0] := crc[0:31] // bit reflection +tmp3[95:0] := tmp1[63:0] << 32 +tmp4[95:0] := tmp2[31:0] << 64 +tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] +tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 +dst[31:0] := tmp6[0:31] // bit reflection + + + SSE4.2 +
nmmintrin.h
+ Cryptography +
+ + + + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*8 + dst[i+7:i] := ABS(Int(a[i+7:i])) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 15 + i := j*8 + dst[i+7:i] := ABS(a[i+7:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := ABS(Int(a[i+15:i])) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := ABS(a[i+15:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 1 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". + +FOR j := 0 to 3 + i := j*32 + dst[i+31:i] := ABS(a[i+31:i]) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Special Math Functions +
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 15 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[3:0] := b[i+3:i] + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Swizzle +
+ + + + + Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". + +FOR j := 0 to 7 + i := j*8 + IF b[i+7] == 1 + dst[i+7:i] := 0 + ELSE + index[2:0] := b[i+2:i] + dst[i+7:i] := a[index*8+7:index*8] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Swizzle +
+ + + + + + Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". + +tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +dst[127:0] := tmp[127:0] + + + SSSE3 +
tmmintrin.h
+ Miscellaneous +
+ + + + + + Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "imm8" bytes, and store the low 8 bytes in "dst". + +tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +dst[63:0] := tmp[63:0] + + + SSSE3 +
tmmintrin.h
+ Miscellaneous +
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := a[95:80] + a[79:64] +dst[63:48] := a[127:112] + a[111:96] +dst[79:64] := b[31:16] + b[15:0] +dst[95:80] := b[63:48] + b[47:32] +dst[111:96] := b[95:80] + b[79:64] +dst[127:112] := b[127:112] + b[111:96] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(a[95:80] + a[79:64]) +dst[63:48] := Saturate16(a[127:112] + a[111:96]) +dst[79:64] := Saturate16(b[31:16] + b[15:0]) +dst[95:80] := Saturate16(b[63:48] + b[47:32]) +dst[111:96] := Saturate16(b[95:80] + b[79:64]) +dst[127:112] := Saturate16(b[127:112] + b[111:96]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := a[127:96] + a[95:64] +dst[95:64] := b[63:32] + b[31:0] +dst[127:96] := b[127:96] + b[95:64] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[31:16] + a[15:0] +dst[31:16] := a[63:48] + a[47:32] +dst[47:32] := b[31:16] + b[15:0] +dst[63:48] := b[63:48] + b[47:32] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[63:32] + a[31:0] +dst[63:32] := b[63:32] + b[31:0] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[31:16] + a[15:0]) +dst[31:16] := Saturate16(a[63:48] + a[47:32]) +dst[47:32] := Saturate16(b[31:16] + b[15:0]) +dst[63:48] := Saturate16(b[63:48] + b[47:32]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := a[79:64] - a[95:80] +dst[63:48] := a[111:96] - a[127:112] +dst[79:64] := b[15:0] - b[31:16] +dst[95:80] := b[47:32] - b[63:48] +dst[111:96] := b[79:64] - b[95:80] +dst[127:112] := b[111:96] - b[127:112] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(a[79:64] - a[95:80]) +dst[63:48] := Saturate16(a[111:96] - a[127:112]) +dst[79:64] := Saturate16(b[15:0] - b[31:16]) +dst[95:80] := Saturate16(b[47:32] - b[63:48]) +dst[111:96] := Saturate16(b[79:64] - b[95:80]) +dst[127:112] := Saturate16(b[111:96] - b[127:112]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := a[95:64] - a[127:96] +dst[95:64] := b[31:0] - b[63:32] +dst[127:96] := b[95:64] - b[127:96] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". + +dst[15:0] := a[15:0] - a[31:16] +dst[31:16] := a[47:32] - a[63:48] +dst[47:32] := b[15:0] - b[31:16] +dst[63:48] := b[47:32] - b[63:48] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". + +dst[31:0] := a[31:0] - a[63:32] +dst[63:32] := b[31:0] - b[63:32] + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". + +dst[15:0] := Saturate16(a[15:0] - a[31:16]) +dst[31:16] := Saturate16(a[47:32] - a[63:48]) +dst[47:32] := Saturate16(b[15:0] - b[31:16]) +dst[63:48] := Saturate16(b[47:32] - b[63:48]) + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 7 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". + +FOR j := 0 to 3 + i := j*16 + dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 7 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". + +FOR j := 0 to 3 + i := j*16 + tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 + dst[i+15:i] := tmp[16:1] +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 15 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 3 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 7 + i := j*8 + IF b[i+7:i] < 0 + dst[i+7:i] := -(a[i+7:i]) + ELSE IF b[i+7:i] == 0 + dst[i+7:i] := 0 + ELSE + dst[i+7:i] := a[i+7:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 3 + i := j*16 + IF b[i+15:i] < 0 + dst[i+15:i] := -(a[i+15:i]) + ELSE IF b[i+15:i] == 0 + dst[i+15:i] := 0 + ELSE + dst[i+15:i] := a[i+15:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. + +FOR j := 0 to 1 + i := j*32 + IF b[i+31:i] < 0 + dst[i+31:i] := -(a[i+31:i]) + ELSE IF b[i+31:i] == 0 + dst[i+31:i] := 0 + ELSE + dst[i+31:i] := a[i+31:i] + FI +ENDFOR + + + SSSE3 +
tmmintrin.h
+ Arithmetic +
+ + + + + + Copy the current 64-bit value of the processor's time-stamp counter into "dst". + dst[63:0] := TimeStampCounter + + + TSC +
immintrin.h
+ General Support +
+ + + + + Mark the start of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a transactional region, subsequent loads are not added to the read set of the transaction. If this is used inside a suspend load address tracking region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. + + TSXLDTRK +
immintrin.h
+ Miscellaneous +
+ + + Mark the end of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a suspend load address tracking region it will end the suspend region and all following load addresses will be added to the transaction read set. If this is used inside an active transaction but not in a suspend region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. + + TSXLDTRK +
immintrin.h
+ Miscellaneous +
+ + + + + + Clear the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Send user interprocessor interrupts specified in unsigned 64-bit integer "__a". + + UINTR +
immintrin.h
+ General Support +
+ + + + Sets the user interrupt flag (UIF). + + UINTR +
immintrin.h
+ General Support +
+ + + + Store the current user interrupt flag (UIF) in unsigned 8-bit integer "dst". + + UINTR +
immintrin.h
+ General Support +
+ + + + + Reads the contents of a 64-bit MSR specified in "__A" into "dst". + DEST := MSR[__A] + + + USER_MSR +
x86gprintrin.h
+ General Support +
+ + + + + Writes the contents of "__B" into the 64-bit MSR specified in "__A". + MSR[__A] := __B + + + USER_MSR +
x86gprintrin.h
+ General Support +
+ + + + + Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := ShiftRows(a[i+127:i]) + a[i+127:i] := SubBytes(a[i+127:i]) + a[i+127:i] := MixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". + FOR j := 0 to 1 + i := j*128 + a[i+127:i] := InvShiftRows(a[i+127:i]) + a[i+127:i] := InvSubBytes(a[i+127:i]) + a[i+127:i] := InvMixColumns(a[i+127:i]) + dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] +ENDFOR +dst[MAX:256] := 0 + + + VAES + AVX512VL +
immintrin.h
+ Cryptography +
+ + + + + + + + Carry-less multiplication of one quadword of + 'b' by one quadword of 'c', stores + the 128-bit result in 'dst'. The immediate 'Imm8' is + used to determine which quadwords of 'b' + and 'c' should be used. + +DEFINE PCLMUL128(X,Y) { + FOR i := 0 to 63 + TMP[i] := X[ 0 ] and Y[ i ] + FOR j := 1 to i + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + FOR i := 64 to 126 + TMP[i] := 0 + FOR j := i - 63 to 63 + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + DEST[127] := 0 + RETURN DEST // 128b vector +} +FOR i := 0 to 1 + IF Imm8[0] == 0 + TEMP1 := b.m128[i].qword[0] + ELSE + TEMP1 := b.m128[i].qword[1] + FI + IF Imm8[4] == 0 + TEMP2 := c.m128[i].qword[0] + ELSE + TEMP2 := c.m128[i].qword[1] + FI + dst.m128[i] := PCLMUL128(TEMP1, TEMP2) +ENDFOR +dst[MAX:256] := 0 + + + VPCLMULQDQ + AVX512VL +
immintrin.h
+ Application-Targeted +
+ + + + + + + + Carry-less multiplication of one quadword of + 'b' by one quadword of 'c', stores + the 128-bit result in 'dst'. The immediate 'Imm8' is + used to determine which quadwords of 'b' + and 'c' should be used. + +DEFINE PCLMUL128(X,Y) { + FOR i := 0 to 63 + TMP[i] := X[ 0 ] and Y[ i ] + FOR j := 1 to i + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + FOR i := 64 to 126 + TMP[i] := 0 + FOR j := i - 63 to 63 + TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) + ENDFOR + DEST[ i ] := TMP[ i ] + ENDFOR + DEST[127] := 0 + RETURN DEST // 128b vector +} +FOR i := 0 to 3 + IF Imm8[0] == 0 + TEMP1 := b.m128[i].qword[0] + ELSE + TEMP1 := b.m128[i].qword[1] + FI + IF Imm8[4] == 0 + TEMP2 := c.m128[i].qword[0] + ELSE + TEMP2 := c.m128[i].qword[1] + FI + dst.m128[i] := PCLMUL128(TEMP1, TEMP2) +ENDFOR +dst[MAX:512] := 0 + + + VPCLMULQDQ +
immintrin.h
+ Application-Targeted +
+ + + + + + + Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + + Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + Sets up a linear address range to be + monitored by hardware and activates the + monitor. The address range should be a writeback + memory caching type. The address is + contained in "a". + + WAITPKG +
immintrin.h
+ Miscellaneous +
+ + + + + + Write back and do not flush internal caches. + Initiate writing-back without flushing of external + caches. + + WBNOINVD +
immintrin.h
+ Miscellaneous +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEC +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEC +
immintrin.h
+ OS-Targeted +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEOPT +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSAVEOPT +
immintrin.h
+ OS-Targeted +
+ + + + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE + XSS +
immintrin.h
+ OS-Targeted +
+ + + + + + Copy up to 64-bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported. + dst[63:0] := XCR[a] + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. + st_mask := mem_addr.HEADER.XSTATE_BV[62:0] +FOR i := 0 to 62 + IF (rs_mask[i] AND XCR0[i]) + IF st_mask[i] + CASE (i) OF + 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] + 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] + DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] + ESAC + ELSE + // ProcessorExtendedState := Processor Supplied Values + CASE (i) OF + 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] + ESAC + FI + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. + mask[62:0] := save_mask[62:0] AND XCR0[62:0] +FOR i := 0 to 62 + IF mask[i] + CASE (i) OF + 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] + 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] + DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] + ESAC + mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] + FI + i := i + 1 +ENDFOR + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + + + + Copy 64-bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported. + +XCR[a] := val[63:0] + + + XSAVE +
immintrin.h
+ OS-Targeted +
+ + +
\ No newline at end of file From 08541d8af5d4f6c8ab32c5dbc6c294b1b952389c Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 21:56:48 +0530 Subject: [PATCH 11/73] fix: update arch flags being sent to the x86 compilation command --- crates/intrinsic-test/src/x86/compile.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index e8c2262b85..8baf581596 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -6,7 +6,7 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { // -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations let mut command = CompilationCommandBuilder::new() - .add_arch_flags(vec![ + .add_arch_flags([ "avx", "avx2", "avx512f", From ccdba428e82900e105c098b11d3e98654e669e24 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 23:12:38 +0530 Subject: [PATCH 12/73] fix: set default value for varname and type fields of the parameters/return value of an intrinsic --- crates/intrinsic-test/src/x86/xml_parser.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 0b422bddb5..71788785ef 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -39,9 +39,9 @@ struct XMLIntrinsic { #[derive(Debug, PartialEq, Clone, Deserialize)] pub struct Parameter { - #[serde(rename = "@varname")] + #[serde(rename = "@varname", default)] pub var_name: String, - #[serde(rename = "@type")] + #[serde(rename = "@type", default)] pub type_data: String, #[serde(rename = "@etype", default)] pub etype: String, From a447a1d88d36471f5d3847ae8e5f33be412f6fb9 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 5 Aug 2025 23:28:46 +0530 Subject: [PATCH 13/73] fix: correcting 
semantical logic for setting vec_len --- crates/intrinsic-test/src/x86/types.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 542d1ad3fa..17980798ea 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -220,7 +220,7 @@ impl X86IntrinsicType { data.vec_len = match str::parse::(type_processed.as_str()) { // If bit_len is None, vec_len will be None. // Else vec_len will be (num_bits / bit_len). - Ok(num_bits) => data.bit_len.and(Some(num_bits / data.bit_len.unwrap())), + Ok(num_bits) => data.bit_len.and_then(|bit_len| Some(num_bits / bit_len)), Err(_) => None, }; } From 781135b50cf8d583153d8f18aa9c1a10e84c121d Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 5 Sep 2025 14:11:38 +0530 Subject: [PATCH 14/73] fix: more support for Mask types --- .../src/common/intrinsic_helpers.rs | 5 +++-- crates/intrinsic-test/src/x86/types.rs | 15 +++++++++------ crates/intrinsic-test/src/x86/xml_parser.rs | 16 +++++++++++++++- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 7bc1015a38..7a2a1ecdc9 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -131,7 +131,7 @@ impl IntrinsicType { if let Some(bl) = self.bit_len { bl } else { - unreachable!("") + unreachable!("{:#?}", self) } } @@ -222,7 +222,8 @@ impl IntrinsicType { match self { IntrinsicType { bit_len: Some(bit_len @ (8 | 16 | 32 | 64)), - kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_)), + kind: + kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask), simd_len, vec_len, .. 
diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 17980798ea..cf1c56f04d 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -5,6 +5,7 @@ use itertools::Itertools; use regex::Regex; use super::intrinsic::X86IntrinsicType; +use crate::common::argument::Argument; use crate::common::cli::Language; use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind}; use crate::x86::xml_parser::Parameter; @@ -18,7 +19,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { fn c_single_vector_type(&self) -> String { // matches __m128, __m256 and similar types - let re = Regex::new(r"\__m\d+\").unwrap(); + let re = Regex::new(r"__m\d+").unwrap(); if re.is_match(self.param.type_data.as_str()) { self.param.type_data.clone() } else { @@ -129,8 +130,6 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { impl X86IntrinsicType { fn from_c(s: &str) -> Result { let mut s_copy = s.to_string(); - let mut metadata: HashMap = HashMap::new(); - metadata.insert("type".to_string(), s.to_string()); s_copy = s_copy .replace("*", "") .replace("_", "") @@ -196,6 +195,9 @@ impl X86IntrinsicType { let mut etype_processed = param.etype.clone(); etype_processed.retain(|c| c.is_numeric()); + let mut type_processed = param.type_data.clone(); + type_processed.retain(|c| c.is_numeric()); + match str::parse::(etype_processed.as_str()) { Ok(value) => data.bit_len = Some(value), Err(_) => { @@ -209,14 +211,16 @@ impl X86IntrinsicType { } } + if param.type_data.matches("__mmask").next().is_some() { + data.bit_len = str::parse::(type_processed.as_str()).ok(); + } + // then check the param.type and extract numeric part if there are double // underscores. divide this number with bit-len and set this as simd-len. // Only __m types can have a simd-len. 
if param.type_data.matches("__m").next().is_some() && param.type_data.matches("__mmask").next().is_none() { - let mut type_processed = param.type_data.clone(); - type_processed.retain(|c| c.is_numeric()); data.vec_len = match str::parse::(type_processed.as_str()) { // If bit_len is None, vec_len will be None. // Else vec_len will be (num_bits / bit_len). @@ -235,7 +239,6 @@ impl X86IntrinsicType { // if param.etype == IMM, then it is a constant. // else it stays unchanged. data.constant |= param.etype == "IMM"; - Ok(X86IntrinsicType { data, param: param.clone(), diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 71788785ef..7465cb72d5 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -3,6 +3,7 @@ use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; use crate::x86::constraint::map_constraints; +use regex::Regex; use serde::{Deserialize, Deserializer}; use std::path::Path; @@ -96,11 +97,24 @@ fn xml_to_intrinsic( if args.iter().any(|elem| elem.is_none()) { return Err(Box::from("intrinsic isn't fully supported in this test!")); } - let args = args + let mut args = args .into_iter() .map(|e| e.unwrap()) .filter(|arg| arg.ty.ptr || arg.ty.kind != TypeKind::Void) .collect::>(); + + let mut args_test = args.iter(); + + // if one of the args has etype="MASK" and type="__md", + // then set the bit_len and vec_len accordingly + let re = Regex::new(r"__m\d+").unwrap(); + let is_mask = |arg: &Argument| arg.ty.param.etype.as_str() == "MASK"; + let is_vector = |arg: &Argument| re.is_match(arg.ty.param.type_data.as_str()); + let pos = args_test.position(|arg| is_mask(arg) && is_vector(arg)); + if let Some(index) = pos { + args[index].ty.bit_len = args[0].ty.bit_len; + } + let arguments = ArgumentList:: { args }; if let Err(message) = result { From a232857aa89ccb28a967ea0e42bb4f68e5dcffa1 Mon Sep 17 00:00:00 2001 From: Madhav 
Madhusoodanan Date: Sun, 7 Sep 2025 00:02:28 +0530 Subject: [PATCH 15/73] fix: remove unused imports --- crates/intrinsic-test/src/x86/types.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index cf1c56f04d..4d5a0a5b7c 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -1,11 +1,9 @@ -use std::collections::HashMap; use std::str::FromStr; use itertools::Itertools; use regex::Regex; use super::intrinsic::X86IntrinsicType; -use crate::common::argument::Argument; use crate::common::cli::Language; use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind}; use crate::x86::xml_parser::Parameter; From 71d4636b2d24c4e5842b5891b0afba542a7c78f3 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 7 Sep 2025 16:33:05 +0530 Subject: [PATCH 16/73] feat: implemented print_result_c in the case the target type is Mask-based --- crates/intrinsic-test/src/x86/intrinsic.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index 169394b793..a4de1e3bdb 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -83,6 +83,7 @@ impl IntrinsicDefinition for Intrinsic { TypeKind::Void => "void".to_string(), TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), + TypeKind::Mask => format!("__mmask{}", self.results.bit_len.unwrap()), // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), From 372d615fd88378179583be1a88715c7031d7de15 Mon Sep 17 00:00:00 2001 From: Madhav 
Madhusoodanan Date: Sun, 7 Sep 2025 23:45:43 +0530 Subject: [PATCH 17/73] feat: implemented get_lane_function for x86 --- crates/intrinsic-test/src/x86/config.rs | 40 +++++++++++++++++++++++++ crates/intrinsic-test/src/x86/mod.rs | 4 +-- crates/intrinsic-test/src/x86/types.rs | 24 ++++++++++++++- 3 files changed, 65 insertions(+), 3 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 427ec183a9..cf831bddd0 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -14,6 +14,46 @@ pub const F16_FORMATTING_DEF: &str = r#" struct Hex(T); "#; +pub const LANE_FUNCTION_HELPERS: &str = r#" +int mm512_extract(__m512i m, int vec_len, int bit_len, int index) { + int lane_len = 128; + int max_major_index = vec_len / lane_len; + int max_minor_index = lane_len / bit_len; + + int major_index = index / max_major_index; + int minor_index = index % max_minor_index; + + __m128i lane = _mm512_extracti64x2_epi64(m, major_index); + + switch(bit_len){ + case 8: + return _mm_extract_epi8(lane, minor_index); + case 16: + return _mm_extract_epi16(lane, minor_index); + case 32: + return _mm_extract_epi32(lane, minor_index); + case 64: + return _mm_extract_epi64(lane, minor_index); + } +} + +int _mm512_extract_intrinsic_test_epi8(__m512i m, int lane) { + return mm512_extract(m, 512, 8, lane) +} + +int _mm512_extract_intrinsic_test_epi16(__m512i m, int lane) { + return mm512_extract(m, 512, 16, lane) +} + +int mm512_extract_intrinsic_test_epi16(__m512i m, int lane) { + return mm512_extract(m, 512, 16, lane) +} + +int mm512_extract_intrinsic_test_epi64(__m512i m, int lane) { + return mm512_extract(m, 512, 64, lane) +} +"#; + pub const X86_CONFIGURATIONS: &str = r#" #![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))] diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 
5515e68385..514783a3e0 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -17,7 +17,7 @@ use crate::common::gen_rust::{ use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; use crate::common::intrinsic_helpers::TypeKind; use crate::common::{SupportedArchitectureTest, chunk_info}; -use crate::x86::config::{F16_FORMATTING_DEF, X86_CONFIGURATIONS}; +use crate::x86::config::{F16_FORMATTING_DEF, LANE_FUNCTION_HELPERS, X86_CONFIGURATIONS}; use config::build_notices; use intrinsic::X86IntrinsicType; use xml_parser::get_xml_intrinsics; @@ -137,7 +137,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { &mut main_rs, chunk_count, X86_CONFIGURATIONS, - "", + LANE_FUNCTION_HELPERS, self.intrinsics.iter().map(|i| i.name.as_str()), ) .unwrap(); diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 4d5a0a5b7c..6ca151308e 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -121,7 +121,29 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { /// Determines the get lane function for this type. 
fn get_lane_function(&self) -> String { - todo!("get_lane_function for X86IntrinsicType needs to be implemented!"); + let total_vector_bits: Option = self + .vec_len + .zip(self.bit_len) + .and_then(|(vec_len, bit_len)| Some(vec_len * bit_len)); + + match (self.bit_len, total_vector_bits) { + (Some(8), Some(128)) => String::from("_mm_extract_epi8"), + (Some(16), Some(128)) => String::from("_mm_extract_epi16"), + (Some(32), Some(128)) => String::from("_mm_extract_epi32"), + (Some(64), Some(128)) => String::from("_mm_extract_epi64"), + (Some(8), Some(256)) => String::from("_mm256_extract_epi8"), + (Some(16), Some(256)) => String::from("_mm256_extract_epi16"), + (Some(32), Some(256)) => String::from("_mm256_extract_epi32"), + (Some(64), Some(256)) => String::from("_mm256_extract_epi64"), + (Some(8), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi8"), + (Some(16), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi16"), + (Some(32), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi32"), + (Some(64), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi64"), + _ => unreachable!( + "invalid length for vector argument: {:?}, {:?}", + self.bit_len, self.vec_len + ), + } } } From a3de32e5e4179bea5086bb694ec428a8586818df Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 8 Sep 2025 00:47:07 +0530 Subject: [PATCH 18/73] chore: update c_prefix for mask and print_result_c for vector type --- crates/intrinsic-test/src/common/intrinsic_helpers.rs | 1 + crates/intrinsic-test/src/x86/intrinsic.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 7a2a1ecdc9..1351ca345b 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -75,6 +75,7 @@ impl TypeKind { Self::Float => "float", Self::Int(Sign::Signed) => "int", Self::Int(Sign::Unsigned) => 
"uint", + Self::Mask => "uint", Self::Poly => "poly", Self::Char(Sign::Signed) => "char", _ => unreachable!("Not used: {:#?}", self), diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index a4de1e3bdb..d3a01ec227 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -84,6 +84,7 @@ impl IntrinsicDefinition for Intrinsic { TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), TypeKind::Mask => format!("__mmask{}", self.results.bit_len.unwrap()), + TypeKind::Vector => format!("__m{}i", self.results.bit_len.unwrap()), // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), From 13733a718ae75f4a06cb9fb19109fec2c430fe9d Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 8 Sep 2025 20:02:03 +0530 Subject: [PATCH 19/73] feat: handled extraction for 64-bit vector elements --- crates/intrinsic-test/src/x86/config.rs | 5 +++++ crates/intrinsic-test/src/x86/types.rs | 2 ++ 2 files changed, 7 insertions(+) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index cf831bddd0..bf139e5e53 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -52,6 +52,11 @@ int mm512_extract_intrinsic_test_epi16(__m512i m, int lane) { int mm512_extract_intrinsic_test_epi64(__m512i m, int lane) { return mm512_extract(m, 512, 64, lane) } + +int mm64_extract_intrinsic_test_epi32(__m64 m, int lane) { + int bit_shift_amount = lane * 32; + return _m_to_int(m >> bit_shift_amount); +} "#; pub const X86_CONFIGURATIONS: &str = r#" diff --git a/crates/intrinsic-test/src/x86/types.rs 
b/crates/intrinsic-test/src/x86/types.rs index 6ca151308e..2bb1ecb9f6 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -139,6 +139,8 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { (Some(16), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi16"), (Some(32), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi32"), (Some(64), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi64"), + (Some(16), Some(64)) => String::from("_mm_extract_pi16"), + (Some(32), Some(64)) => String::from("mm64_extract_intrinsic_test_epi32"), _ => unreachable!( "invalid length for vector argument: {:?}, {:?}", self.bit_len, self.vec_len From dcca5062be466e5aec3c1e45d154052ac0a3a1e1 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 9 Sep 2025 00:54:50 +0530 Subject: [PATCH 20/73] feat: add 8x8 case for get_lane_function for 64-bit vector --- crates/intrinsic-test/src/x86/config.rs | 7 +++++++ crates/intrinsic-test/src/x86/types.rs | 1 + 2 files changed, 8 insertions(+) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index bf139e5e53..d0c5981122 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -53,6 +53,13 @@ int mm512_extract_intrinsic_test_epi64(__m512i m, int lane) { return mm512_extract(m, 512, 64, lane) } +int mm64_extract_intrinsic_test_epi8(__m64 m, int lane) { + int real_lane_shift = lane / 2; + int real_bit_shift = (lane % 2) * 8; + int result = _mm_extract_pi16(m, lane / 2); + return (result >> real_bit_shift); +} + int mm64_extract_intrinsic_test_epi32(__m64 m, int lane) { int bit_shift_amount = lane * 32; return _m_to_int(m >> bit_shift_amount); diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 2bb1ecb9f6..7e96657977 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -139,6 +139,7 @@ impl 
IntrinsicTypeDefinition for X86IntrinsicType { (Some(16), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi16"), (Some(32), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi32"), (Some(64), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi64"), + (Some(8), Some(64)) => String::from("mm64_extract_intrinsic_test_epi8"), (Some(16), Some(64)) => String::from("_mm_extract_pi16"), (Some(32), Some(64)) => String::from("mm64_extract_intrinsic_test_epi32"), _ => unreachable!( From 180d6f0f6e2d88af5b76b41229c5fafb03e13a88 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 10 Sep 2025 00:00:07 +0530 Subject: [PATCH 21/73] debug: printing self incase print_result_c fails. --- crates/intrinsic-test/src/x86/intrinsic.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index d3a01ec227..79f119b0e0 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -83,8 +83,8 @@ impl IntrinsicDefinition for Intrinsic { TypeKind::Void => "void".to_string(), TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), - TypeKind::Mask => format!("__mmask{}", self.results.bit_len.unwrap()), - TypeKind::Vector => format!("__m{}i", self.results.bit_len.unwrap()), + TypeKind::Mask => format!("__mmask{}", self.results.bit_len.expect(format!("self: {:#?}", self).as_str())), + TypeKind::Vector => format!("__m{}i", self.results.bit_len.expect(format!("self: {:#?}", self).as_str())), // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), From 838e925d32c4f0d7edb124e517f935312e5ab7bb Mon Sep 17 00:00:00 
2001 From: Madhav Madhusoodanan Date: Wed, 10 Sep 2025 11:28:02 +0530 Subject: [PATCH 22/73] chore: update x86 module, removed intrinsicDefinition trait, formatting updates --- Cargo.lock | 5 +- crates/intrinsic-test/src/main.rs | 1 + crates/intrinsic-test/src/x86/config.rs | 13 +- crates/intrinsic-test/src/x86/intrinsic.rs | 90 +---------- crates/intrinsic-test/src/x86/mod.rs | 165 ++++----------------- crates/intrinsic-test/src/x86/types.rs | 79 +++++++++- 6 files changed, 115 insertions(+), 238 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 26a4223271..70f09adf2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -998,17 +998,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -<<<<<<< HEAD name = "windows_x86_64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" -======= + +[[package]] name = "xml-rs" version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fd8403733700263c6eb89f192880191f1b83e332f7a20371ddcf421c4a337c7" ->>>>>>> 3283a857 (feat: updated intrinsics creation) [[package]] name = "yaml-rust" diff --git a/crates/intrinsic-test/src/main.rs b/crates/intrinsic-test/src/main.rs index d780e35160..ed3a50067d 100644 --- a/crates/intrinsic-test/src/main.rs +++ b/crates/intrinsic-test/src/main.rs @@ -20,6 +20,7 @@ fn main() { | "armv7-unknown-linux-gnueabihf" | "aarch64_be-unknown-linux-gnu" => run(ArmArchitectureTest::create(processed_cli_options)), + "x86_64-unknown-linux-gnu" => run(X86ArchitectureTest::create(processed_cli_options)), _ => std::process::exit(0), } } diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index d0c5981122..f1e9e9932e 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ 
-1,12 +1,7 @@ -pub fn build_notices(line_prefix: &str) -> String { - format!( - "\ -{line_prefix}This is a transient test file, not intended for distribution. Some aspects of the -{line_prefix}test are derived from an XML specification, published under the same license as the -{line_prefix}`intrinsic-test` crate.\n -" - ) -} +pub const NOTICE: &str = "\ +// This is a transient test file, not intended for distribution. Some aspects of the +// test are derived from an XML specification, published under the same license as the +// `intrinsic-test` crate.\n"; // Format f16 values (and vectors containing them) in a way that is consistent with C. pub const F16_FORMATTING_DEF: &str = r#" diff --git a/crates/intrinsic-test/src/x86/intrinsic.rs b/crates/intrinsic-test/src/x86/intrinsic.rs index 79f119b0e0..1417c51ea1 100644 --- a/crates/intrinsic-test/src/x86/intrinsic.rs +++ b/crates/intrinsic-test/src/x86/intrinsic.rs @@ -1,7 +1,4 @@ -use crate::common::argument::ArgumentList; -use crate::common::indentation::Indentation; -use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; -use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, TypeKind}; +use crate::common::intrinsic_helpers::IntrinsicType; use crate::x86::xml_parser::Parameter; use std::ops::{Deref, DerefMut}; @@ -24,88 +21,3 @@ impl DerefMut for X86IntrinsicType { &mut self.data } } - -impl IntrinsicDefinition for Intrinsic { - fn arguments(&self) -> ArgumentList { - self.arguments.clone() - } - - fn results(&self) -> X86IntrinsicType { - self.results.clone() - } - - fn name(&self) -> String { - self.name.clone() - } - - /// Generates a std::cout for the intrinsics results that will match the - /// rust debug output format for the return type. The generated line assumes - /// there is an int i in scope which is the current pass number. 
- fn print_result_c(&self, indentation: Indentation, additional: &str) -> String { - let lanes = if self.results().num_vectors() > 1 { - (0..self.results().num_vectors()) - .map(|vector| { - format!( - r#""{ty}(" << {lanes} << ")""#, - ty = self.results().c_single_vector_type(), - lanes = (0..self.results().num_lanes()) - .map(move |idx| -> std::string::String { - format!( - "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", - cast = self.results().c_promotion(), - lane_fn = self.results().get_lane_function(), - lane = idx, - vector = vector, - ) - }) - .collect::>() - .join(r#" << ", " << "#) - ) - }) - .collect::>() - .join(r#" << ", " << "#) - } else if self.results().num_lanes() > 1 { - (0..self.results().num_lanes()) - .map(|idx| -> std::string::String { - format!( - "{cast}{lane_fn}(__return_value, {lane})", - cast = self.results().c_promotion(), - lane_fn = self.results().get_lane_function(), - lane = idx - ) - }) - .collect::>() - .join(r#" << ", " << "#) - } else { - format!( - "{promote}cast<{cast}>(__return_value)", - cast = match self.results.kind() { - TypeKind::Void => "void".to_string(), - TypeKind::Float if self.results().inner_size() == 64 => "double".to_string(), - TypeKind::Float if self.results().inner_size() == 32 => "float".to_string(), - TypeKind::Mask => format!("__mmask{}", self.results.bit_len.expect(format!("self: {:#?}", self).as_str())), - TypeKind::Vector => format!("__m{}i", self.results.bit_len.expect(format!("self: {:#?}", self).as_str())), - // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), - // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), - // TypeKind::Int(true) if self.results().inner_size() == 32 => "int".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 32 => "unsigned int".to_string(), - // TypeKind::Int(true) if 
self.results().inner_size() == 16 => "short".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 16 => "unsigned short".to_string(), - _ => self.results.c_scalar_type(), - }, - promote = self.results().c_promotion(), - ) - }; - - format!( - r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#, - ty = if self.results().is_simd() { - format!("{}(", self.results().c_type()) - } else { - String::from("") - }, - close = if self.results.is_simd() { ")" } else { "" }, - ) - } -} diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 514783a3e0..e73ceb5084 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -5,20 +5,12 @@ mod intrinsic; mod types; mod xml_parser; -use rayon::prelude::*; -use std::fs::{self, File}; - +use crate::common::SupportedArchitectureTest; use crate::common::cli::ProcessedCli; use crate::common::compare::compare_outputs; -use crate::common::gen_c::{write_main_cpp, write_mod_cpp}; -use crate::common::gen_rust::{ - compile_rust_programs, write_bin_cargo_toml, write_lib_cargo_toml, write_lib_rs, write_main_rs, -}; -use crate::common::intrinsic::{Intrinsic, IntrinsicDefinition}; +use crate::common::compile_c::CppCompilation; +use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; -use crate::common::{SupportedArchitectureTest, chunk_info}; -use crate::x86::config::{F16_FORMATTING_DEF, LANE_FUNCTION_HELPERS, X86_CONFIGURATIONS}; -use config::build_notices; use intrinsic::X86IntrinsicType; use xml_parser::get_xml_intrinsics; @@ -28,7 +20,30 @@ pub struct X86ArchitectureTest { } impl SupportedArchitectureTest for X86ArchitectureTest { - fn create(cli_options: ProcessedCli) -> Box { + type IntrinsicImpl = X86IntrinsicType; + + fn cli_options(&self) -> &ProcessedCli { + &self.cli_options + } + + fn intrinsics(&self) -> 
&[Intrinsic] { + &self.intrinsics + } + + fn cpp_compilation(&self) -> Option { + compile::build_cpp_compilation(&self.cli_options) + } + + const NOTICE: &str = config::NOTICE; + + const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h"]; + const PLATFORM_C_DEFINITIONS: &str = config::LANE_FUNCTION_HELPERS; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = ""; + + const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; + const PLATFORM_RUST_CFGS: &str = config::X86_CONFIGURATIONS; + + fn create(cli_options: ProcessedCli) -> Self { let intrinsics = get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); @@ -37,7 +52,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { // Not sure how we would compare intrinsic that returns void. .filter(|i| i.results.kind() != TypeKind::Void) .filter(|i| i.results.kind() != TypeKind::BFloat) - .filter(|i| i.arguments().args.len() > 0) + .filter(|i| i.arguments.args.len() > 0) .filter(|i| !i.arguments.iter().any(|a| a.ty.kind() == TypeKind::BFloat)) // Skip pointers for now, we would probably need to look at the return // type to work out how many elements we need to point to. 
@@ -47,132 +62,10 @@ impl SupportedArchitectureTest for X86ArchitectureTest { .collect::>(); intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); - Box::new(Self { + Self { intrinsics: intrinsics, cli_options: cli_options, - }) - } - - fn build_c_file(&self) -> bool { - let c_target = "x86_64"; - let platform_headers = &["immintrin.h"]; - - let (chunk_size, chunk_count) = chunk_info(self.intrinsics.len()); - - let cpp_compiler_wrapped = compile::build_cpp_compilation(&self.cli_options); - - let notice = &build_notices("// "); - fs::create_dir_all("c_programs").unwrap(); - self.intrinsics - .par_chunks(chunk_size) - .enumerate() - .map(|(i, chunk)| { - let c_filename = format!("c_programs/mod_{i}.cpp"); - let mut file = File::create(&c_filename).unwrap(); - write_mod_cpp(&mut file, notice, c_target, platform_headers, chunk).unwrap(); - - // compile this cpp file into a .o file. - // - // This is done because `cpp_compiler_wrapped` is None when - // the --generate-only flag is passed - if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { - let output = cpp_compiler - .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; - assert!(output.status.success(), "{output:?}"); - } - - Ok(()) - }) - .collect::>() - .unwrap(); - - let mut file = File::create("c_programs/main.cpp").unwrap(); - write_main_cpp( - &mut file, - c_target, - "\n", - self.intrinsics.iter().map(|i| i.name.as_str()), - ) - .unwrap(); - - // This is done because `cpp_compiler_wrapped` is None when - // the --generate-only flag is passed - if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { - // compile this cpp file into a .o file - info!("compiling main.cpp"); - let output = cpp_compiler - .compile_object_file("main.cpp", "intrinsic-test-programs.o") - .unwrap(); - assert!(output.status.success(), "{output:?}"); - - let object_files = (0..chunk_count) - .map(|i| format!("mod_{i}.o")) - .chain(["intrinsic-test-programs.o".to_owned()]); - - let output = cpp_compiler - 
.link_executable(object_files, "intrinsic-test-programs") - .unwrap(); - assert!(output.status.success(), "{output:?}"); } - - true - } - - fn build_rust_file(&self) -> bool { - std::fs::create_dir_all("rust_programs/src").unwrap(); - - let architecture = if self.cli_options.target.contains("v7") { - "arm" - } else { - "aarch64" - }; - - let (chunk_size, chunk_count) = chunk_info(self.intrinsics.len()); - - let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); - write_bin_cargo_toml(&mut cargo, chunk_count).unwrap(); - - let mut main_rs = File::create("rust_programs/src/main.rs").unwrap(); - write_main_rs( - &mut main_rs, - chunk_count, - X86_CONFIGURATIONS, - LANE_FUNCTION_HELPERS, - self.intrinsics.iter().map(|i| i.name.as_str()), - ) - .unwrap(); - - let target = &self.cli_options.target; - let toolchain = self.cli_options.toolchain.as_deref(); - let linker = self.cli_options.linker.as_deref(); - - let notice = &build_notices("// "); - self.intrinsics - .par_chunks(chunk_size) - .enumerate() - .map(|(i, chunk)| { - std::fs::create_dir_all(format!("rust_programs/mod_{i}/src"))?; - - let rust_filename = format!("rust_programs/mod_{i}/src/lib.rs"); - trace!("generating `{rust_filename}`"); - let mut file = File::create(rust_filename)?; - - let cfg = X86_CONFIGURATIONS; - let definitions = F16_FORMATTING_DEF; - write_lib_rs(&mut file, architecture, notice, cfg, definitions, chunk)?; - - let toml_filename = format!("rust_programs/mod_{i}/Cargo.toml"); - trace!("generating `{toml_filename}`"); - let mut file = File::create(toml_filename).unwrap(); - - write_lib_cargo_toml(&mut file, &format!("mod_{i}"))?; - - Ok(()) - }) - .collect::>() - .unwrap(); - - compile_rust_programs(toolchain, target, linker) } fn compare_outputs(&self) -> bool { diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 7e96657977..bb7ea59dbe 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs 
@@ -5,6 +5,7 @@ use regex::Regex; use super::intrinsic::X86IntrinsicType; use crate::common::cli::Language; +use crate::common::indentation::Indentation; use crate::common::intrinsic_helpers::{IntrinsicType, IntrinsicTypeDefinition, Sign, TypeKind}; use crate::x86::xml_parser::Parameter; @@ -116,7 +117,83 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { // then typecast it. format!("({type_value})") } - // Look for edge cases (constexpr, literal, etc) + } + + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + fn print_result_c(&self, indentation: Indentation, additional: &str) -> String { + let lanes = if self.num_vectors() > 1 { + (0..self.num_vectors()) + .map(|vector| { + format!( + r#""{ty}(" << {lanes} << ")""#, + ty = self.c_single_vector_type(), + lanes = (0..self.num_lanes()) + .map(move |idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", + cast = self.c_promotion(), + lane_fn = self.get_lane_function(), + lane = idx, + vector = vector, + ) + }) + .collect::>() + .join(r#" << ", " << "#) + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else if self.num_lanes() > 1 { + (0..self.num_lanes()) + .map(|idx| -> std::string::String { + format!( + "{cast}{lane_fn}(__return_value, {lane})", + cast = self.c_promotion(), + lane_fn = self.get_lane_function(), + lane = idx + ) + }) + .collect::>() + .join(r#" << ", " << "#) + } else { + format!( + "{promote}cast<{cast}>(__return_value)", + cast = match self.kind() { + TypeKind::Void => "void".to_string(), + TypeKind::Float if self.inner_size() == 64 => "double".to_string(), + TypeKind::Float if self.inner_size() == 32 => "float".to_string(), + TypeKind::Mask => format!( + "__mmask{}", + self.bit_len.expect(format!("self: {:#?}", self).as_str()) + ), + TypeKind::Vector => format!( + "__m{}i", 
+ self.bit_len.expect(format!("self: {:#?}", self).as_str()) + ), + // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), + // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 32 => "int".to_string(), + // TypeKind::Int(false) if self.results().inner_size() == 32 => "unsigned int".to_string(), + // TypeKind::Int(true) if self.results().inner_size() == 16 => "short".to_string(), + // TypeKind::Int(false) if self.results().inner_size() == 16 => "unsigned short".to_string(), + _ => self.c_scalar_type(), + }, + promote = self.c_promotion(), + ) + }; + + format!( + r#"{indentation}std::cout << "Result {additional}-" << i+1 << ": {ty}" << std::fixed << std::setprecision(150) << {lanes} << "{close}" << std::endl;"#, + ty = if self.is_simd() { + format!("{}(", self.c_type()) + } else { + String::from("") + }, + close = if self.is_simd() { ")" } else { "" }, + ) } /// Determines the get lane function for this type. 
From e3ad0e474dab06be830a837f20d790feab48113e Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 13 Sep 2025 23:04:08 +0530 Subject: [PATCH 23/73] fixed errors that caused errors with cpp file generation (un-handled edge cases for Vector and Mask types) --- crates/intrinsic-test/src/common/cli.rs | 4 +- crates/intrinsic-test/src/common/gen_rust.rs | 6 +-- .../src/common/intrinsic_helpers.rs | 44 +++++++++++++++---- crates/intrinsic-test/src/common/mod.rs | 9 ++++ crates/intrinsic-test/src/x86/mod.rs | 19 -------- crates/intrinsic-test/src/x86/types.rs | 20 +++++++++ 6 files changed, 70 insertions(+), 32 deletions(-) diff --git a/crates/intrinsic-test/src/common/cli.rs b/crates/intrinsic-test/src/common/cli.rs index beae6a4b04..f8125128ea 100644 --- a/crates/intrinsic-test/src/common/cli.rs +++ b/crates/intrinsic-test/src/common/cli.rs @@ -44,7 +44,9 @@ pub struct Cli { pub generate_only: bool, /// Pass a target the test suite - #[arg(long, default_value_t = String::from("armv7-unknown-linux-gnueabihf"))] + /// x86_64-unknown-linux-gnu + /// armv7-unknown-linux-gnueabihf + #[arg(long, default_value_t = String::from("x86_64-unknown-linux-gnu"))] pub target: String, /// Set the linker diff --git a/crates/intrinsic-test/src/common/gen_rust.rs b/crates/intrinsic-test/src/common/gen_rust.rs index d659cbc4aa..e97b745c59 100644 --- a/crates/intrinsic-test/src/common/gen_rust.rs +++ b/crates/intrinsic-test/src/common/gen_rust.rs @@ -191,7 +191,7 @@ pub fn generate_rust_test_loop( w: &mut impl std::io::Write, intrinsic: &Intrinsic, indentation: Indentation, - specializations: &[Vec], + specializations: &[Vec], passes: u32, ) -> std::io::Result<()> { let intrinsic_name = &intrinsic.name; @@ -256,7 +256,7 @@ pub fn generate_rust_test_loop( /// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic. 
fn generate_rust_specializations( constraints: &mut impl Iterator>, -) -> Vec> { +) -> Vec> { let mut specializations = vec![vec![]]; for constraint in constraints { @@ -264,7 +264,7 @@ fn generate_rust_specializations( .flat_map(|right| { specializations.iter().map(move |left| { let mut left = left.clone(); - left.push(u8::try_from(right).unwrap()); + left.push(i64::try_from(right).unwrap()); left }) }) diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 1351ca345b..7403b81df8 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -78,6 +78,7 @@ impl TypeKind { Self::Mask => "uint", Self::Poly => "poly", Self::Char(Sign::Signed) => "char", + Self::Vector => "int", _ => unreachable!("Not used: {:#?}", self), } } @@ -155,6 +156,7 @@ impl IntrinsicType { pub fn c_scalar_type(&self) -> String { match self.kind() { TypeKind::Char(_) => String::from("char"), + TypeKind::Vector => String::from("int32_t"), _ => format!( "{prefix}{bits}_t", prefix = self.kind().c_prefix(), @@ -163,14 +165,6 @@ impl IntrinsicType { } } - pub fn rust_scalar_type(&self) -> String { - format!( - "{prefix}{bits}", - prefix = self.kind().rust_prefix(), - bits = self.inner_size() - ) - } - pub fn c_promotion(&self) -> &str { match *self { IntrinsicType { @@ -285,6 +279,29 @@ impl IntrinsicType { ))) ) } + IntrinsicType { + kind: TypeKind::Vector, + bit_len: Some(bit_len @ (128 | 256 | 512)), + simd_len, + .. 
+ } => { + let (prefix, suffix) = match language { + Language::Rust => ("[", "]"), + Language::C => ("{", "}"), + }; + let body_indentation = indentation.nested(); + let effective_bit_len = 32; + let effective_vec_len = bit_len / effective_bit_len; + format!( + "{prefix}\n{body}\n{indentation}{suffix}", + body = (0..(simd_len.unwrap_or(1) * effective_vec_len + loads - 1)) + .format_with(",\n", |i, fmt| { + let src = value_for_array(effective_bit_len, i); + assert!(src == 0 || src.ilog2() < *bit_len); + fmt(&format_args!("{body_indentation}{src:#x}")) + }) + ) + } _ => unimplemented!("populate random: {:#?}", self), } } @@ -300,7 +317,7 @@ impl IntrinsicType { kind: TypeKind::Int(_) | TypeKind::Poly, .. } => true, - _ => unimplemented!(), + _ => true, } } @@ -332,4 +349,13 @@ pub trait IntrinsicTypeDefinition: Deref { /// rust debug output format for the return type. The generated line assumes /// there is an int i in scope which is the current pass number. fn print_result_c(&self, indentation: Indentation, additional: &str) -> String; + + /// To enable architecture-specific logic + fn rust_scalar_type(&self) -> String { + format!( + "{prefix}{bits}", + prefix = self.kind().rust_prefix(), + bits = self.inner_size() + ) + } } diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 666b3885c1..cb422c9cac 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -1,4 +1,5 @@ use std::fs::File; +use std::io::{self, Write}; use rayon::prelude::*; @@ -76,6 +77,14 @@ pub trait SupportedArchitectureTest { if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { let output = cpp_compiler .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; + if !output.status.success() { + io::stdout() + .write_all(&output.stdout) + .expect("Failed to write to stdout!"); + io::stderr() + .write_all(&output.stderr) + .expect("Failed to write to stderr!"); + } 
assert!(output.status.success(), "{output:?}"); } diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index e73ceb5084..13ae627e66 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -7,7 +7,6 @@ mod xml_parser; use crate::common::SupportedArchitectureTest; use crate::common::cli::ProcessedCli; -use crate::common::compare::compare_outputs; use crate::common::compile_c::CppCompilation; use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; @@ -67,22 +66,4 @@ impl SupportedArchitectureTest for X86ArchitectureTest { cli_options: cli_options, } } - - fn compare_outputs(&self) -> bool { - if self.cli_options.toolchain.is_some() { - let intrinsics_name_list = self - .intrinsics - .iter() - .map(|i| i.name.clone()) - .collect::>(); - - compare_outputs( - &intrinsics_name_list, - &self.cli_options.runner, - &self.cli_options.target, - ) - } else { - true - } - } } diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index bb7ea59dbe..127dd38e6f 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -225,6 +225,20 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { ), } } + + fn rust_scalar_type(&self) -> String { + let re = Regex::new(r"\__m\d+[a-z]*").unwrap(); + if let Some(match_type) = re.find(self.param.type_data.as_str()) { + match_type.as_str().to_string() + } else { + let prefix = match self.data.kind { + TypeKind::Mask => String::from("__mmask"), + _ => self.kind().rust_prefix().to_string(), + }; + + format!("{prefix}{bits}", bits = self.inner_size()) + } + } } impl X86IntrinsicType { @@ -336,6 +350,12 @@ impl X86IntrinsicType { data.bit_len = Some(8); } + // default settings for "void *" parameters + // often used by intrinsics to denote memory address or so. 
+ if data.kind == TypeKind::Mask && data.bit_len.is_none() { + data.bit_len = Some(32); + } + // if param.etype == IMM, then it is a constant. // else it stays unchanged. data.constant |= param.etype == "IMM"; From 2e05d598e9e2b7f14c5f1bd156eda7c740b096f5 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 14 Sep 2025 21:50:57 +0530 Subject: [PATCH 24/73] feat: correcting errors with generated C artifacts --- crates/intrinsic-test/src/arm/mod.rs | 12 +++- crates/intrinsic-test/src/common/argument.rs | 2 +- crates/intrinsic-test/src/common/gen_c.rs | 16 ++--- crates/intrinsic-test/src/common/mod.rs | 1 + crates/intrinsic-test/src/x86/compile.rs | 16 +---- crates/intrinsic-test/src/x86/config.rs | 63 +++++++------------- crates/intrinsic-test/src/x86/mod.rs | 12 +++- crates/intrinsic-test/src/x86/types.rs | 16 ++++- 8 files changed, 63 insertions(+), 75 deletions(-) diff --git a/crates/intrinsic-test/src/arm/mod.rs b/crates/intrinsic-test/src/arm/mod.rs index 08dc2d3870..a915d0d883 100644 --- a/crates/intrinsic-test/src/arm/mod.rs +++ b/crates/intrinsic-test/src/arm/mod.rs @@ -31,7 +31,17 @@ impl SupportedArchitectureTest for ArmArchitectureTest { const NOTICE: &str = config::NOTICE; - const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"]; + const PLATFORM_C_HEADERS: &[&str] = &[ + "iostream", + "cstring", + "iomanip", + "sstream", + "cstddef", + "cstdint", + "arm_neon.h", + "arm_acle.h", + "arm_fp16.h", + ]; const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF; const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL; diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index f38515e40a..871e3d2243 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -108,7 +108,7 @@ where for arg in self.iter().filter(|&arg| !arg.has_constraint()) { writeln!( w, - "{indentation}const {ty} {name}_vals[] = 
{values};", + "{indentation}alignas(64) const {ty} {name}_vals[] = {values};", ty = arg.ty.c_scalar_type(), name = arg.name, values = arg.ty.populate_random(indentation, loads, &Language::C) diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index 28902b3dfe..b7651dce59 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -47,7 +47,7 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>( let ty = current.ty.c_type(); writeln!(w, "{indentation}{{")?; - writeln!(w, "{body_indentation}{ty} {} = {i};", current.name)?; + writeln!(w, "{body_indentation}const {ty} {} = {i};", current.name)?; generate_c_constraint_blocks( w, @@ -103,14 +103,11 @@ pub fn write_mod_cpp( writeln!(w, "#include <{header}>")?; } + writeln!(w, "{}", forward_declarations)?; + writeln!( w, r#" -#include -#include -#include -#include - template T1 cast(T2 x) {{ static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); T1 ret{{}}; @@ -120,13 +117,9 @@ template T1 cast(T2 x) {{ std::ostream& operator<<(std::ostream& os, float16_t value); - - "# )?; - writeln!(w, "{}", forward_declarations)?; - for intrinsic in intrinsics { create_c_test_function(w, intrinsic)?; } @@ -137,12 +130,13 @@ std::ostream& operator<<(std::ostream& os, float16_t value); pub fn write_main_cpp<'a>( w: &mut impl std::io::Write, arch_specific_definitions: &str, + arch_specific_headers: &[&str], intrinsics: impl Iterator + Clone, ) -> std::io::Result<()> { writeln!(w, "#include ")?; writeln!(w, "#include ")?; - for header in ["arm_neon.h", "arm_acle.h", "arm_fp16.h"] { + for header in arch_specific_headers { writeln!(w, "#include <{header}>")?; } diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index cb422c9cac..5966bc2aec 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -97,6 +97,7 @@ pub 
trait SupportedArchitectureTest { write_main_cpp( &mut file, Self::PLATFORM_C_DEFINITIONS, + Self::PLATFORM_C_HEADERS, self.intrinsics().iter().map(|i| i.name.as_str()), ) .unwrap(); diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 8baf581596..3e08a491a0 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -6,21 +6,7 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { // -ffp-contract=off emulates Rust's approach of not fusing separate mul-add operations let mut command = CompilationCommandBuilder::new() - .add_arch_flags([ - "avx", - "avx2", - "avx512f", - "avx512cd", - "avx512dq", - "avx512vl", - "avx512bw", - "avx512bf16", - "avx512bitalg", - "lzcnt", - "popcnt", - "adx", - "aes", - ]) + .add_arch_flags(["icelake-client"]) .set_compiler(cpp_compiler) .set_target(&config.target) .set_opt_level("2") diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index f1e9e9932e..e43fd33093 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -10,62 +10,41 @@ struct Hex(T); "#; pub const LANE_FUNCTION_HELPERS: &str = r#" -int mm512_extract(__m512i m, int vec_len, int bit_len, int index) { - int lane_len = 128; - int max_major_index = vec_len / lane_len; - int max_minor_index = lane_len / bit_len; +typedef float float16_t; +typedef float float32_t; +typedef double float64_t; - int major_index = index / max_major_index; - int minor_index = index % max_minor_index; +#define __int64 long long - __m128i lane = _mm512_extracti64x2_epi64(m, major_index); +#define _mm512_extract_intrinsic_test_epi8(m, lane) \ + _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) - switch(bit_len){ - case 8: - return _mm_extract_epi8(lane, minor_index); - case 16: - return _mm_extract_epi16(lane, minor_index); - case 32: - return _mm_extract_epi32(lane, minor_index); - 
case 64: - return _mm_extract_epi64(lane, minor_index); - } -} +#define _mm512_extract_intrinsic_test_epi16(m, lane) \ + _mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8) -int _mm512_extract_intrinsic_test_epi8(__m512i m, int lane) { - return mm512_extract(m, 512, 8, lane) -} +#define _mm512_extract_intrinsic_test_epi32(m, lane) \ + _mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4) -int _mm512_extract_intrinsic_test_epi16(__m512i m, int lane) { - return mm512_extract(m, 512, 16, lane) -} +#define _mm512_extract_intrinsic_test_epi64(m, lane) \ + _mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2) -int mm512_extract_intrinsic_test_epi16(__m512i m, int lane) { - return mm512_extract(m, 512, 16, lane) -} +#define _mm64_extract_intrinsic_test_epi8(m, lane) \ + ((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF) -int mm512_extract_intrinsic_test_epi64(__m512i m, int lane) { - return mm512_extract(m, 512, 64, lane) -} - -int mm64_extract_intrinsic_test_epi8(__m64 m, int lane) { - int real_lane_shift = lane / 2; - int real_bit_shift = (lane % 2) * 8; - int result = _mm_extract_pi16(m, lane / 2); - return (result >> real_bit_shift); -} - -int mm64_extract_intrinsic_test_epi32(__m64 m, int lane) { - int bit_shift_amount = lane * 32; - return _m_to_int(m >> bit_shift_amount); -} +#define _mm64_extract_intrinsic_test_epi32(m, lane) \ + _mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32)) "#; pub const X86_CONFIGURATIONS: &str = r#" +#![cfg_attr(target_arch = "x86", feature(avx))] +#![cfg_attr(target_arch = "x86", feature(sse))] +#![cfg_attr(target_arch = "x86", feature(sse2))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_bf16))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] +#![cfg_attr(target_arch = "x86_64", feature(sse))] 
+#![cfg_attr(target_arch = "x86_64", feature(sse2))] #![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))] #![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))] #![feature(fmt_helpers_for_derive)] diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 13ae627e66..2ed3296169 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -35,9 +35,17 @@ impl SupportedArchitectureTest for X86ArchitectureTest { const NOTICE: &str = config::NOTICE; - const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h"]; + const PLATFORM_C_HEADERS: &[&str] = &[ + "immintrin.h", + "iostream", + "cstring", + "iomanip", + "sstream", + "cstddef", + "cstdint", + ]; const PLATFORM_C_DEFINITIONS: &str = config::LANE_FUNCTION_HELPERS; - const PLATFORM_C_FORWARD_DECLARATIONS: &str = ""; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::LANE_FUNCTION_HELPERS; const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; const PLATFORM_RUST_CFGS: &str = config::X86_CONFIGURATIONS; diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 127dd38e6f..a1dc5623ca 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -110,7 +110,17 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { .filter(|c| c.is_numeric()) .join("") .replace("128", ""); - format!("_mm{type_val_filtered}_set1_epi64") + { + if type_value.ends_with("d") { + format!("_mm{type_val_filtered}_loadu_pd") + } else if type_value.ends_with("h") { + format!("_mm{type_val_filtered}_loadu_ph") + } else if type_value.ends_with("i") { + format!("_mm{type_val_filtered}_loadu_epi16") + } else { + format!("_mm{type_val_filtered}_loadu_ps") + } + } } else { // if it is a pointer, then rely on type conversion // If it is not any of the above type (__int, __bfloat16, unsigned short, etc) @@ -216,9 +226,9 @@ impl IntrinsicTypeDefinition for 
X86IntrinsicType { (Some(16), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi16"), (Some(32), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi32"), (Some(64), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi64"), - (Some(8), Some(64)) => String::from("mm64_extract_intrinsic_test_epi8"), + (Some(8), Some(64)) => String::from("_mm64_extract_intrinsic_test_epi8"), (Some(16), Some(64)) => String::from("_mm_extract_pi16"), - (Some(32), Some(64)) => String::from("mm64_extract_intrinsic_test_epi32"), + (Some(32), Some(64)) => String::from("_mm64_extract_intrinsic_test_epi32"), _ => unreachable!( "invalid length for vector argument: {:?}, {:?}", self.bit_len, self.vec_len From e977009da425950c81a1c8cd8846f8213cfdc2b5 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 15 Sep 2025 01:03:48 +0530 Subject: [PATCH 25/73] fix: vec_len -> simd_len (an error was present due to setting vec_len instead of simd_len for AVX register types) --- crates/intrinsic-test/src/x86/config.rs | 1 + crates/intrinsic-test/src/x86/types.rs | 12 ++++++------ crates/intrinsic-test/src/x86/xml_parser.rs | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index e43fd33093..58fabcbd0e 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -15,6 +15,7 @@ typedef float float32_t; typedef double float64_t; #define __int64 long long +#define __int32 int #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index a1dc5623ca..99c52551ad 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -209,9 +209,9 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { /// Determines the get lane function for this 
type. fn get_lane_function(&self) -> String { let total_vector_bits: Option = self - .vec_len + .simd_len .zip(self.bit_len) - .and_then(|(vec_len, bit_len)| Some(vec_len * bit_len)); + .and_then(|(simd_len, bit_len)| Some(simd_len * bit_len)); match (self.bit_len, total_vector_bits) { (Some(8), Some(128)) => String::from("_mm_extract_epi8"), @@ -231,7 +231,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { (Some(32), Some(64)) => String::from("_mm64_extract_intrinsic_test_epi32"), _ => unreachable!( "invalid length for vector argument: {:?}, {:?}", - self.bit_len, self.vec_len + self.bit_len, self.simd_len ), } } @@ -345,9 +345,9 @@ impl X86IntrinsicType { if param.type_data.matches("__m").next().is_some() && param.type_data.matches("__mmask").next().is_none() { - data.vec_len = match str::parse::(type_processed.as_str()) { - // If bit_len is None, vec_len will be None. - // Else vec_len will be (num_bits / bit_len). + data.simd_len = match str::parse::(type_processed.as_str()) { + // If bit_len is None, simd_len will be None. + // Else simd_len will be (num_bits / bit_len). 
Ok(num_bits) => data.bit_len.and_then(|bit_len| Some(num_bits / bit_len)), Err(_) => None, }; diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 7465cb72d5..808f594a8c 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -106,7 +106,7 @@ fn xml_to_intrinsic( let mut args_test = args.iter(); // if one of the args has etype="MASK" and type="__md", - // then set the bit_len and vec_len accordingly + // then set the bit_len and simd_len accordingly let re = Regex::new(r"__m\d+").unwrap(); let is_mask = |arg: &Argument| arg.ty.param.etype.as_str() == "MASK"; let is_vector = |arg: &Argument| re.is_match(arg.ty.param.type_data.as_str()); From c1d971069c188aafed03b2dde123002a4b8021a9 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 16 Sep 2025 17:08:35 +0530 Subject: [PATCH 26/73] chore: revert default target --- crates/intrinsic-test/src/common/cli.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/intrinsic-test/src/common/cli.rs b/crates/intrinsic-test/src/common/cli.rs index f8125128ea..beae6a4b04 100644 --- a/crates/intrinsic-test/src/common/cli.rs +++ b/crates/intrinsic-test/src/common/cli.rs @@ -44,9 +44,7 @@ pub struct Cli { pub generate_only: bool, /// Pass a target the test suite - /// x86_64-unknown-linux-gnu - /// armv7-unknown-linux-gnueabihf - #[arg(long, default_value_t = String::from("x86_64-unknown-linux-gnu"))] + #[arg(long, default_value_t = String::from("armv7-unknown-linux-gnueabihf"))] pub target: String, /// Set the linker From 91eb33115bfe41231bc5c9d4dd34bed0f0f9b008 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 17 Sep 2025 23:58:20 +0530 Subject: [PATCH 27/73] chore: adding comments about memory alignment of variables and bash scripts that will be used in CI --- crates/intrinsic-test/src/arm/mod.rs | 2 -- crates/intrinsic-test/src/common/argument.rs | 2 ++ 
crates/intrinsic-test/src/common/gen_rust.rs | 6 +++--- crates/intrinsic-test/src/common/mod.rs | 9 --------- 4 files changed, 5 insertions(+), 14 deletions(-) diff --git a/crates/intrinsic-test/src/arm/mod.rs b/crates/intrinsic-test/src/arm/mod.rs index a915d0d883..8f8289a7ec 100644 --- a/crates/intrinsic-test/src/arm/mod.rs +++ b/crates/intrinsic-test/src/arm/mod.rs @@ -36,8 +36,6 @@ impl SupportedArchitectureTest for ArmArchitectureTest { "cstring", "iomanip", "sstream", - "cstddef", - "cstdint", "arm_neon.h", "arm_acle.h", "arm_fp16.h", diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index 871e3d2243..0ab01e4144 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -106,6 +106,8 @@ where loads: u32, ) -> std::io::Result<()> { for arg in self.iter().filter(|&arg| !arg.has_constraint()) { + // Setting the variables on an aligned boundary to make it easier to pick + // functions (of a specific architecture) that would help load the values. writeln!( w, "{indentation}alignas(64) const {ty} {name}_vals[] = {values};", diff --git a/crates/intrinsic-test/src/common/gen_rust.rs b/crates/intrinsic-test/src/common/gen_rust.rs index e97b745c59..3b330879e0 100644 --- a/crates/intrinsic-test/src/common/gen_rust.rs +++ b/crates/intrinsic-test/src/common/gen_rust.rs @@ -191,7 +191,7 @@ pub fn generate_rust_test_loop( w: &mut impl std::io::Write, intrinsic: &Intrinsic, indentation: Indentation, - specializations: &[Vec], + specializations: &[Vec], passes: u32, ) -> std::io::Result<()> { let intrinsic_name = &intrinsic.name; @@ -256,7 +256,7 @@ pub fn generate_rust_test_loop( /// Generate the specializations (unique sequences of const-generic arguments) for this intrinsic. 
fn generate_rust_specializations( constraints: &mut impl Iterator>, -) -> Vec> { +) -> Vec> { let mut specializations = vec![vec![]]; for constraint in constraints { @@ -264,7 +264,7 @@ fn generate_rust_specializations( .flat_map(|right| { specializations.iter().map(move |left| { let mut left = left.clone(); - left.push(i64::try_from(right).unwrap()); + left.push(i32::try_from(right).unwrap()); left }) }) diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 5966bc2aec..da9c75f5a0 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -1,5 +1,4 @@ use std::fs::File; -use std::io::{self, Write}; use rayon::prelude::*; @@ -77,14 +76,6 @@ pub trait SupportedArchitectureTest { if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { let output = cpp_compiler .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; - if !output.status.success() { - io::stdout() - .write_all(&output.stdout) - .expect("Failed to write to stdout!"); - io::stderr() - .write_all(&output.stderr) - .expect("Failed to write to stderr!"); - } assert!(output.status.success(), "{output:?}"); } From 1cf7c54a09f38375edf80cf93dfcb729931e89b8 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 18 Sep 2025 01:39:51 +0530 Subject: [PATCH 28/73] chore: add compilation flags --- crates/intrinsic-test/src/x86/compile.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 3e08a491a0..9f3a76c4c1 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -12,7 +12,14 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { .set_opt_level("2") .set_cxx_toolchain_dir(config.cxx_toolchain_dir.as_deref()) .set_project_root("c_programs") - .add_extra_flags(vec!["-ffp-contract=off", "-Wno-narrowing"]); + .add_extra_flags(vec![ + 
"-ffp-contract=off", + "-Wno-narrowing", + "-mavx", + "-mavx2", + "-mavx512f", + "-msse2", + ]); if !cpp_compiler.contains("clang") { command = command.add_extra_flag("-flax-vector-conversions"); From 5b6f2f13292f8d579639171b97f35d658f3b2775 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 18 Sep 2025 13:41:46 +0530 Subject: [PATCH 29/73] chore: add better error handling when writing and compiling mod_{i}.cpp, neatly organize C++ headers --- crates/intrinsic-test/src/arm/mod.rs | 10 +--------- crates/intrinsic-test/src/common/gen_c.rs | 12 +++--------- crates/intrinsic-test/src/common/mod.rs | 20 +++++++++++++------- crates/intrinsic-test/src/x86/mod.rs | 10 +--------- 4 files changed, 18 insertions(+), 34 deletions(-) diff --git a/crates/intrinsic-test/src/arm/mod.rs b/crates/intrinsic-test/src/arm/mod.rs index 8f8289a7ec..08dc2d3870 100644 --- a/crates/intrinsic-test/src/arm/mod.rs +++ b/crates/intrinsic-test/src/arm/mod.rs @@ -31,15 +31,7 @@ impl SupportedArchitectureTest for ArmArchitectureTest { const NOTICE: &str = config::NOTICE; - const PLATFORM_C_HEADERS: &[&str] = &[ - "iostream", - "cstring", - "iomanip", - "sstream", - "arm_neon.h", - "arm_acle.h", - "arm_fp16.h", - ]; + const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"]; const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF; const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL; diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index b7651dce59..25e4e210c3 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -6,6 +6,7 @@ use super::intrinsic_helpers::IntrinsicTypeDefinition; // The number of times each intrinsic will be called. 
const PASSES: u32 = 20; +const COMMON_HEADERS: [&str; 5] = ["iostream", "string", "cstring", "iomanip", "sstream"]; pub fn generate_c_test_loop( w: &mut impl std::io::Write, @@ -99,7 +100,7 @@ pub fn write_mod_cpp( ) -> std::io::Result<()> { write!(w, "{notice}")?; - for header in platform_headers { + for header in COMMON_HEADERS.iter().chain(platform_headers.iter()) { writeln!(w, "#include <{header}>")?; } @@ -133,20 +134,13 @@ pub fn write_main_cpp<'a>( arch_specific_headers: &[&str], intrinsics: impl Iterator + Clone, ) -> std::io::Result<()> { - writeln!(w, "#include ")?; - writeln!(w, "#include ")?; - - for header in arch_specific_headers { + for header in COMMON_HEADERS.iter().chain(arch_specific_headers.iter()) { writeln!(w, "#include <{header}>")?; } writeln!( w, r#" -#include -#include -#include - std::ostream& operator<<(std::ostream& os, float16_t value) {{ uint16_t temp = 0; memcpy(&temp, &value, sizeof(float16_t)); diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index da9c75f5a0..37a48654e4 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -60,28 +60,34 @@ pub trait SupportedArchitectureTest { .map(|(i, chunk)| { let c_filename = format!("c_programs/mod_{i}.cpp"); let mut file = File::create(&c_filename).unwrap(); - write_mod_cpp( + let mod_file_write_result = write_mod_cpp( &mut file, Self::NOTICE, Self::PLATFORM_C_HEADERS, Self::PLATFORM_C_FORWARD_DECLARATIONS, chunk, - ) - .unwrap(); + ); + + if let Err(error) = mod_file_write_result { + return Err(format!("Error writing to mod_{i}.cpp: {error:?}")); + } // compile this cpp file into a .o file. 
// // This is done because `cpp_compiler_wrapped` is None when // the --generate-only flag is passed if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { - let output = cpp_compiler - .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o"))?; - assert!(output.status.success(), "{output:?}"); + let compile_output = cpp_compiler + .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")); + + if let Err(compile_error) = compile_output { + return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}")); + } } Ok(()) }) - .collect::>() + .collect::>() .unwrap(); let mut file = File::create("c_programs/main.cpp").unwrap(); diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 2ed3296169..e4c9742f8d 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -35,15 +35,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { const NOTICE: &str = config::NOTICE; - const PLATFORM_C_HEADERS: &[&str] = &[ - "immintrin.h", - "iostream", - "cstring", - "iomanip", - "sstream", - "cstddef", - "cstdint", - ]; + const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h", "cstddef", "cstdint"]; const PLATFORM_C_DEFINITIONS: &str = config::LANE_FUNCTION_HELPERS; const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::LANE_FUNCTION_HELPERS; From d9ff321fd4119c7ab5fbfc6fdb35f51de95936f6 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 20 Sep 2025 15:14:43 +0530 Subject: [PATCH 30/73] feat: Fixed FP16 errors, made the loading function generation more accurate --- crates/intrinsic-test/src/arm/config.rs | 10 ++++ crates/intrinsic-test/src/common/argument.rs | 14 ++++-- crates/intrinsic-test/src/common/gen_c.rs | 23 ++-------- .../src/common/intrinsic_helpers.rs | 5 +- crates/intrinsic-test/src/common/values.rs | 25 ++++++++++ crates/intrinsic-test/src/x86/compile.rs | 6 +++ crates/intrinsic-test/src/x86/config.rs | 13 +++++- crates/intrinsic-test/src/x86/types.rs | 
46 ++++++++++++++----- crates/intrinsic-test/src/x86/xml_parser.rs | 2 + 9 files changed, 107 insertions(+), 37 deletions(-) diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index d9024eabfa..46706e009e 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -6,6 +6,7 @@ pub const NOTICE: &str = "\ pub const POLY128_OSTREAM_DECL: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value); +std::ostream& operator<<(std::ostream& os, float16_t value); #endif "#; @@ -23,6 +24,15 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) { os << res; return os; } + +std::ostream& operator<<(std::ostream& os, float16_t value) { + uint16_t temp = 0; + memcpy(&temp, &value, sizeof(float16_t)); + std::stringstream ss; + ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; + os << ss.str(); + return os; +} #endif "#; diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index 0ab01e4144..986c383ee1 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -33,6 +33,10 @@ where self.ty.c_type() } + pub fn generate_name(&self) -> String { + format!("{}_val", self.name) + } + pub fn is_simd(&self) -> bool { self.ty.is_simd() } @@ -64,7 +68,7 @@ where } fn as_call_param_c(&self) -> String { - self.ty.as_call_param_c(&self.name) + self.ty.as_call_param_c(&self.generate_name()) } } @@ -91,7 +95,7 @@ where pub fn as_call_param_rust(&self) -> String { self.iter() .filter(|a| !a.has_constraint()) - .map(|arg| arg.name.clone()) + .map(|arg| arg.generate_name()) .collect::>() .join(", ") } @@ -112,7 +116,7 @@ where w, "{indentation}alignas(64) const {ty} {name}_vals[] = {values};", ty = arg.ty.c_scalar_type(), - name = arg.name, + name = arg.generate_name(), values = arg.ty.populate_random(indentation, loads, &Language::C) )? 
} @@ -155,7 +159,7 @@ where format!( "{indentation}{ty} {name} = cast<{ty}>({load}(&{name}_vals[i]));\n", ty = arg.to_c_type(), - name = arg.name, + name = arg.generate_name(), load = if arg.is_simd() { arg.ty.get_load_function(Language::C) } else { @@ -175,7 +179,7 @@ where .map(|arg| { format!( "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n", - name = arg.name, + name = arg.generate_name(), vals_name = arg.rust_vals_array_name(), load = if arg.is_simd() { arg.ty.get_load_function(Language::Rust) diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index 25e4e210c3..aeb94176f5 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -48,7 +48,11 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>( let ty = current.ty.c_type(); writeln!(w, "{indentation}{{")?; - writeln!(w, "{body_indentation}const {ty} {} = {i};", current.name)?; + writeln!( + w, + "{body_indentation}const {ty} {} = {i};", + current.generate_name() + )?; generate_c_constraint_blocks( w, @@ -115,9 +119,6 @@ template T1 cast(T2 x) {{ memcpy(&ret, &x, sizeof(T1)); return ret; }} - -std::ostream& operator<<(std::ostream& os, float16_t value); - "# )?; @@ -138,20 +139,6 @@ pub fn write_main_cpp<'a>( writeln!(w, "#include <{header}>")?; } - writeln!( - w, - r#" -std::ostream& operator<<(std::ostream& os, float16_t value) {{ - uint16_t temp = 0; - memcpy(&temp, &value, sizeof(float16_t)); - std::stringstream ss; - ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; - os << ss.str(); - return os; -}} -"# - )?; - // NOTE: It's assumed that this value contains the required `ifdef`s. 
writeln!(w, "{arch_specific_definitions }")?; diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 7403b81df8..5d930eea2f 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::fmt; use std::ops::Deref; use std::str::FromStr; @@ -131,7 +132,7 @@ impl IntrinsicType { pub fn inner_size(&self) -> u32 { if let Some(bl) = self.bit_len { - bl + cmp::max(bl, 8) } else { unreachable!("{:#?}", self) } @@ -216,7 +217,7 @@ impl IntrinsicType { ) -> String { match self { IntrinsicType { - bit_len: Some(bit_len @ (8 | 16 | 32 | 64)), + bit_len: Some(bit_len @ (1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 16 | 32 | 64)), kind: kind @ (TypeKind::Int(_) | TypeKind::Poly | TypeKind::Char(_) | TypeKind::Mask), simd_len, diff --git a/crates/intrinsic-test/src/common/values.rs b/crates/intrinsic-test/src/common/values.rs index 1b614a742e..6c94ef2c22 100644 --- a/crates/intrinsic-test/src/common/values.rs +++ b/crates/intrinsic-test/src/common/values.rs @@ -4,6 +4,13 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 { let index = index as usize; match bits { + 1 => VALUES_8[index % 2].into(), + 2 => VALUES_8[index % 4].into(), + 3 => VALUES_8[index % 8].into(), + 4 => VALUES_8[index % 16].into(), + 5 => VALUES_5[index % VALUES_5.len()].into(), + 6 => VALUES_6[index % VALUES_6.len()].into(), + 7 => VALUES_7[index % VALUES_7.len()].into(), 8 => VALUES_8[index % VALUES_8.len()].into(), 16 => VALUES_16[index % VALUES_16.len()].into(), 32 => VALUES_32[index % VALUES_32.len()].into(), @@ -12,6 +19,24 @@ pub fn value_for_array(bits: u32, index: u32) -> u64 { } } +pub const VALUES_5: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x019, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, + 0x1f, +]; + +pub const VALUES_6: 
&[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x039, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, + 0x3f, +]; + +pub const VALUES_7: &[u8] = &[ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x079, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, + 0x7f, +]; + pub const VALUES_8: &[u8] = &[ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xf0, 0x80, 0x3b, 0xff, diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 9f3a76c4c1..6eaab86150 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -19,6 +19,12 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { "-mavx2", "-mavx512f", "-msse2", + "-mavx512vl", + "-mavx512bw", + "-mavx512dq", + "-mavx512cd", + "-mavx512fp16", + "-ferror-limit=1000", ]); if !cpp_compiler.contains("clang") { diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 58fabcbd0e..32a9b586c0 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -10,13 +10,24 @@ struct Hex(T); "#; pub const LANE_FUNCTION_HELPERS: &str = r#" -typedef float float16_t; +typedef _Float16 float16_t; typedef float float32_t; typedef double float64_t; #define __int64 long long #define __int32 int +std::ostream& operator<<(std::ostream& os, _Float16 value); + +std::ostream& operator<<(std::ostream& os, _Float16 value) { + uint16_t temp = 0; + memcpy(&temp, &value, sizeof(_Float16)); + std::stringstream ss; + ss << "0x" << std::setfill('0') << std::setw(4) << std::hex << temp; + os << ss.str(); + return os; +} + #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 
16) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 99c52551ad..dfaf2adaf4 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -13,7 +13,17 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { /// Gets a string containing the type in C format. /// This function assumes that this value is present in the metadata hashmap. fn c_type(&self) -> String { - self.param.type_data.clone() + self.param + .type_data + .replace("unsigned __int64", "uint64_t") + .replace("unsigned __int32", "uint32_t") + .replace("unsigned __int16", "uint16_t") + .replace("unsigned __int8", "uint8_t") + .replace("__int64", "int64_t") + .replace("__int32", "int32_t") + .replace("__int16", "int16_t") + .replace("__int8", "int8_t") + .replace("const ", "") } fn c_single_vector_type(&self) -> String { @@ -109,17 +119,22 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { .chars() .filter(|c| c.is_numeric()) .join("") - .replace("128", ""); + .replace("128", "") + .replace("64", ""); { - if type_value.ends_with("d") { - format!("_mm{type_val_filtered}_loadu_pd") - } else if type_value.ends_with("h") { - format!("_mm{type_val_filtered}_loadu_ph") - } else if type_value.ends_with("i") { - format!("_mm{type_val_filtered}_loadu_epi16") - } else { - format!("_mm{type_val_filtered}_loadu_ps") - } + let suffix = match (self.bit_len, self.kind) { + (Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => { + format!("epi{bit_len}") + } + (Some(16), TypeKind::Float) => format!("ph"), + (Some(32), TypeKind::Float) => format!("ps"), + (Some(64), TypeKind::Float) => format!("pd"), + (Some(128), TypeKind::Vector) => format!("si128"), + (Some(256), TypeKind::Vector) => format!("si256"), + (Some(512), TypeKind::Vector) => format!("si512"), + _ => unreachable!("Invalid element type for a vector type! 
{:?}", self.param), + }; + format!("_mm{type_val_filtered}_loadu_{suffix}") } } else { // if it is a pointer, then rely on type conversion @@ -366,6 +381,15 @@ impl X86IntrinsicType { data.bit_len = Some(32); } + // default settings for IMM parameters + if param.etype == "IMM" && param.imm_width > 0 { + data.bit_len = Some(param.imm_width); + } + + if param.etype == "IMM" || param.imm_width > 0 || param.imm_type.len() > 0 { + data.constant = true; + } + // if param.etype == IMM, then it is a constant. // else it stays unchanged. data.constant |= param.etype == "IMM"; diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 808f594a8c..157a37fc9d 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -48,6 +48,8 @@ pub struct Parameter { pub etype: String, #[serde(rename = "@memwidth", default, deserialize_with = "string_to_u32")] pub memwidth: u32, + #[serde(rename = "@immwidth", default, deserialize_with = "string_to_u32")] + pub imm_width: u32, #[serde(rename = "@immtype", default)] pub imm_type: String, } From 48c6ed13a65799975897c90b4deb3a47e26b8498 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:07:45 +0530 Subject: [PATCH 31/73] chore: Ensuring "const" appears for constant arguments to intrinsics. Extra changes: 1. 
Using "as _" to allow for implicit typecasting --- crates/intrinsic-test/src/common/argument.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index 986c383ee1..5963abef2f 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -30,7 +30,8 @@ where } pub fn to_c_type(&self) -> String { - self.ty.c_type() + let prefix = if self.ty.constant { "const " } else { "" }; + format!("{}{}", prefix, self.ty.c_type()) } pub fn generate_name(&self) -> String { @@ -95,7 +96,7 @@ where pub fn as_call_param_rust(&self) -> String { self.iter() .filter(|a| !a.has_constraint()) - .map(|arg| arg.generate_name()) + .map(|arg| arg.generate_name() + " as _") .collect::>() .join(", ") } @@ -177,15 +178,16 @@ where self.iter() .filter(|&arg| !arg.has_constraint()) .map(|arg| { + let load = if arg.is_simd() { + arg.ty.get_load_function(Language::Rust) + } else { + "*".to_string() + }; + let typecast = if load.len() > 2 { "as _" } else { "" }; format!( - "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i));\n", + "{indentation}let {name} = {load}({vals_name}.as_ptr().offset(i){typecast});\n", name = arg.generate_name(), vals_name = arg.rust_vals_array_name(), - load = if arg.is_simd() { - arg.ty.get_load_function(Language::Rust) - } else { - "*".to_string() - }, ) }) .collect() From 45d097f5a463fc4aa58a6f819be9c382607532b9 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:10:34 +0530 Subject: [PATCH 32/73] chore: allowing cast() function to allow implicit type conversion for certain cases (like uint32_t to uint64_t) extras: 1. added more C++ headers 2.
typecasting integer constants (for example, the MM_FROUND arguments) for type compatibility --- crates/intrinsic-test/src/common/gen_c.rs | 30 ++++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index aeb94176f5..965e229da5 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -6,7 +6,15 @@ use super::intrinsic_helpers::IntrinsicTypeDefinition; // The number of times each intrinsic will be called. const PASSES: u32 = 20; -const COMMON_HEADERS: [&str; 5] = ["iostream", "string", "cstring", "iomanip", "sstream"]; +const COMMON_HEADERS: [&str; 7] = [ + "iostream", + "string", + "cstring", + "iomanip", + "sstream", + "type_traits", + "cassert", +]; pub fn generate_c_test_loop( w: &mut impl std::io::Write, @@ -48,9 +56,13 @@ pub fn generate_c_constraint_blocks<'a, T: IntrinsicTypeDefinition + 'a>( let ty = current.ty.c_type(); writeln!(w, "{indentation}{{")?; + + // TODO: Move to actually specifying the enum value + // instead of typecasting integers, for better clarity + // of generated code. 
writeln!( w, - "{body_indentation}const {ty} {} = {i};", + "{body_indentation}const {ty} {} = ({ty}){i};", current.generate_name() )?; @@ -113,11 +125,17 @@ pub fn write_mod_cpp( writeln!( w, r#" +// T1 is the `To` type, T2 is the `From` type template T1 cast(T2 x) {{ - static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); - T1 ret{{}}; - memcpy(&ret, &x, sizeof(T1)); - return ret; + if (std::is_convertible::value) {{ + return x; + }} else if (sizeof(T1) == sizeof(T2)) {{ + T1 ret{{}}; + memcpy(&ret, &x, sizeof(T1)); + return ret; + }} else {{ + assert("T2 must either be convertable to T1, or have the same size as T1!"); + }} }} "# )?; From 5553fdd3602b67c4c92670f07b9b2d4e4cf1fc95 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:13:10 +0530 Subject: [PATCH 33/73] feat: matching the expected number of elements for array to load arguments, accommodating for signed variables too --- .../src/common/intrinsic_helpers.rs | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 5d930eea2f..43a0e3f5d1 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -284,6 +284,7 @@ impl IntrinsicType { kind: TypeKind::Vector, bit_len: Some(bit_len @ (128 | 256 | 512)), simd_len, + vec_len, .. 
} => { let (prefix, suffix) = match language { @@ -292,14 +293,27 @@ impl IntrinsicType { }; let body_indentation = indentation.nested(); let effective_bit_len = 32; - let effective_vec_len = bit_len / effective_bit_len; format!( "{prefix}\n{body}\n{indentation}{suffix}", - body = (0..(simd_len.unwrap_or(1) * effective_vec_len + loads - 1)) + body = (0..(vec_len.unwrap_or(1) * simd_len.unwrap_or(1) + loads - 1)) .format_with(",\n", |i, fmt| { let src = value_for_array(effective_bit_len, i); - assert!(src == 0 || src.ilog2() < *bit_len); - fmt(&format_args!("{body_indentation}{src:#x}")) + assert!(src == 0 || src.ilog2() < effective_bit_len); + if (src >> (effective_bit_len - 1)) != 0 { + // `src` is a two's complement representation of a negative value. + let mask = !0u64 >> (64 - effective_bit_len); + let ones_compl = src ^ mask; + let twos_compl = ones_compl + 1; + if (twos_compl == src) && (language == &Language::C) { + // `src` is INT*_MIN. C requires `-0x7fffffff - 1` to avoid + // undefined literal overflow behaviour. + fmt(&format_args!("{body_indentation}-{ones_compl:#x} - 1")) + } else { + fmt(&format_args!("{body_indentation}-{twos_compl:#x}")) + } + } else { + fmt(&format_args!("{body_indentation}{src:#x}")) + } }) ) } From 39a0e45ffb784b8398a3907ceae39204ce4bdca0 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:14:20 +0530 Subject: [PATCH 34/73] feat: updated with debug printing and ostream implementation for vector types --- crates/intrinsic-test/src/x86/config.rs | 100 +++++++++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 32a9b586c0..a199a58ff2 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -5,8 +5,64 @@ pub const NOTICE: &str = "\ // Format f16 values (and vectors containing them) in a way that is consistent with C. 
pub const F16_FORMATTING_DEF: &str = r#" +use std::arch::x86_64::*; + +#[inline] +fn debug_simd_finish( + formatter: &mut core::fmt::Formatter<'_>, + type_name: &str, + array: &[T; N], +) -> core::fmt::Result { + core::fmt::Formatter::debug_tuple_fields_finish( + formatter, + type_name, + &core::array::from_fn::<&dyn core::fmt::Debug, N, _>(|i| &array[i]), + ) +} + #[repr(transparent)] struct Hex(T); + +impl core::fmt::Debug for Hex { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + ::fmt(&self.0, f) + } +} + +fn debug_f16(x: T) -> impl core::fmt::Debug { + Hex(x) +} + +trait DebugHexF16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; +} + +impl DebugHexF16 for f16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{:#06x?}", self.to_bits()) + } +} + +impl DebugHexF16 for __m128h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 8]>(*self) }; + debug_simd_finish(f, "__m128h", &array) + } +} + +impl DebugHexF16 for __m256h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 16]>(*self) }; + debug_simd_finish(f, "__m256h", &array) + } +} + +impl DebugHexF16 for __m512h { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 32]>(*self) }; + debug_simd_finish(f, "__m512h", &array) + } +} "#; pub const LANE_FUNCTION_HELPERS: &str = r#" @@ -18,6 +74,9 @@ typedef double float64_t; #define __int32 int std::ostream& operator<<(std::ostream& os, _Float16 value); +std::ostream& operator<<(std::ostream& os, __m128i value); +std::ostream& operator<<(std::ostream& os, __m256i value); +std::ostream& operator<<(std::ostream& os, __m512i value); std::ostream& operator<<(std::ostream& os, _Float16 value) { uint16_t temp = 0; @@ -28,6 +87,45 @@ std::ostream& 
operator<<(std::ostream& os, _Float16 value) { return os; } +std::ostream& operator<<(std::ostream& os, __m128i value) { + void* temp = malloc(sizeof(__m128i)); + _mm_storeu_si128((__m128i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 16; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __m256i value) { + void* temp = malloc(sizeof(__m256i)); + _mm256_storeu_si256((__m256i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 32; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + +std::ostream& operator<<(std::ostream& os, __m512i value) { + void* temp = malloc(sizeof(__m512i)); + _mm512_storeu_si512((__m512i*)temp, value); + std::stringstream ss; + + ss << "0x"; + for(int i = 0; i < 64; i++) { + ss << std::setfill('0') << std::setw(2) << std::hex << ((char*)temp)[i]; + } + os << ss.str(); + return os; +} + #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) @@ -55,8 +153,6 @@ pub const X86_CONFIGURATIONS: &str = r#" #![cfg_attr(target_arch = "x86", feature(stdarch_x86_avx512_f16))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] #![cfg_attr(target_arch = "x86", feature(stdarch_x86_rtm))] -#![cfg_attr(target_arch = "x86_64", feature(sse))] -#![cfg_attr(target_arch = "x86_64", feature(sse2))] #![cfg_attr(target_arch = "x86_64", feature(x86_amx_intrinsics))] #![cfg_attr(target_arch = "x86_64", feature(stdarch_x86_avx512_f16))] #![feature(fmt_helpers_for_derive)] From 2742d33180cbc3a484ac409a030a668ae4724436 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:15:39 +0530 Subject: [PATCH 35/73] chore: corrected the legal range of values for constrained arguments such as _MM_FROUND_SAE and _MM_ROUND_MODE --- 
crates/intrinsic-test/src/x86/constraint.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/crates/intrinsic-test/src/x86/constraint.rs b/crates/intrinsic-test/src/x86/constraint.rs index 1f06988388..72f5da3b3f 100644 --- a/crates/intrinsic-test/src/x86/constraint.rs +++ b/crates/intrinsic-test/src/x86/constraint.rs @@ -1,19 +1,29 @@ use crate::common::constraint::Constraint; -pub fn map_constraints(imm_type: &String) -> Option { +pub fn map_constraints(imm_type: &String, imm_width: u32) -> Option { + if imm_width > 0 { + let max: i64 = 2i64.pow(imm_width); + return Some(Constraint::Range(0..max)); + } match imm_type.as_str() { - "_MM_FROUND" => Some(Constraint::Range(0..4)), + // Legal values for variables of `_MM_FROUND` type are: + // 8 => (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions + // 9 => (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions + // 10 => (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions + // 11 => (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions + // 4 => _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + "_MM_FROUND" => Some(Constraint::Set(vec![4, 8, 9, 10, 11])), "_MM_INDEX_SCALE" => Some(Constraint::Set(vec![1, 2, 4, 8])), "_MM_CMPINT" => Some(Constraint::Range(0..8)), "_MM_REDUCE" => Some(Constraint::Range(0..8)), - "_MM_FROUND_SAE" => Some(Constraint::Range(0..8)), + "_MM_FROUND_SAE" => Some(Constraint::Equal(8)), "_MM_MANTISSA_NORM" => Some(Constraint::Range(0..4)), "_MM_MANTISSA_NORM_ENUM" => Some(Constraint::Range(0..4)), "_MM_MANTISSA_SIGN" => Some(Constraint::Range(0..3)), "_MM_PERM" => Some(Constraint::Range(0..256)), "_MM_PERM_ENUM" => Some(Constraint::Range(0..256)), "_MM_CMPINT_ENUM" => Some(Constraint::Range(0..8)), - "_MM_ROUND_MODE" => Some(Constraint::Set(vec![0, 0x2000, 0x4000, 0x6000])), + "_MM_ROUND_MODE" => 
Some(Constraint::Set(vec![0, 0x2, 0x4, 0x6])), "_CMP_" => Some(Constraint::Range(0..32)), _ => None, } From 18caf69df05ac49612f6b1fa7d8104b5bb631ed3 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:16:23 +0530 Subject: [PATCH 36/73] feat: filter for duplicates in the definition of intrinsics --- crates/intrinsic-test/src/x86/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index e4c9742f8d..d5ebd960b3 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -11,6 +11,7 @@ use crate::common::compile_c::CppCompilation; use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; use intrinsic::X86IntrinsicType; +use itertools::Itertools; use xml_parser::get_xml_intrinsics; pub struct X86ArchitectureTest { @@ -58,6 +59,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { .filter(|i| !i.arguments.iter().any(|a| a.is_ptr())) .filter(|i| !i.arguments.iter().any(|a| a.ty.inner_size() == 128)) .filter(|i| !cli_options.skip.contains(&i.name)) + .unique_by(|i| i.name.clone()) .collect::>(); intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); From 6702469fcec5c68d8aedf99af5a94b7fc7c57360 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:23:30 +0530 Subject: [PATCH 37/73] chore: vector types cannot be the type of an individual element in an array. Extra: 1. Added better load functions 2.
Added an update_simd_len() function to support cases where the bit_len of the element need to be inferred from its partner arguments before calculating the simd_len --- crates/intrinsic-test/src/x86/types.rs | 113 +++++++++++++++++-------- 1 file changed, 77 insertions(+), 36 deletions(-) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index dfaf2adaf4..b07726656a 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -115,6 +115,10 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { // if "type" starts with __m{h/i/}, // then use either _mm_set1_epi64, // _mm256_set1_epi64 or _mm512_set1_epi64 + if type_value.contains("__m64") { + return String::from("*(__m64*)"); + } + let type_val_filtered = type_value .chars() .filter(|c| c.is_numeric()) @@ -126,12 +130,11 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { (Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => { format!("epi{bit_len}") } + (Some(bit_len), TypeKind::Mask) => format!("epi{bit_len}"), (Some(16), TypeKind::Float) => format!("ph"), (Some(32), TypeKind::Float) => format!("ps"), (Some(64), TypeKind::Float) => format!("pd"), - (Some(128), TypeKind::Vector) => format!("si128"), - (Some(256), TypeKind::Vector) => format!("si256"), - (Some(512), TypeKind::Vector) => format!("si512"), + (Some(128 | 256 | 512), TypeKind::Vector) => format!("epi32"), _ => unreachable!("Invalid element type for a vector type! 
{:?}", self.param), }; format!("_mm{type_val_filtered}_loadu_{suffix}") @@ -252,17 +255,18 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { } fn rust_scalar_type(&self) -> String { - let re = Regex::new(r"\__m\d+[a-z]*").unwrap(); - if let Some(match_type) = re.find(self.param.type_data.as_str()) { - match_type.as_str().to_string() - } else { - let prefix = match self.data.kind { - TypeKind::Mask => String::from("__mmask"), - _ => self.kind().rust_prefix().to_string(), - }; + let prefix = match self.data.kind { + TypeKind::Mask => String::from("__mmask"), + TypeKind::Vector => String::from("i"), + _ => self.kind().rust_prefix().to_string(), + }; - format!("{prefix}{bits}", bits = self.inner_size()) - } + let bits = if self.inner_size() >= 128 { + 32 + } else { + self.inner_size() + }; + format!("{prefix}{bits}") } } @@ -311,6 +315,26 @@ impl X86IntrinsicType { }) } + pub fn update_simd_len(&mut self) { + let mut type_processed = self.param.type_data.clone(); + type_processed.retain(|c| c.is_numeric()); + + // check the param.type and extract numeric part if there are double + // underscores. divide this number with bit-len and set this as simd-len. + // Only __m types can have a simd-len. + if self.param.type_data.contains("__m") && !self.param.type_data.contains("__mmask") { + self.data.simd_len = match str::parse::(type_processed.as_str()) { + // If bit_len is None, simd_len will be None. + // Else simd_len will be (num_bits / bit_len). 
+ Ok(num_bits) => self + .data + .bit_len + .and_then(|bit_len| Some(num_bits / bit_len)), + Err(_) => None, + }; + } + } + pub fn from_param(param: &Parameter) -> Result { match Self::from_c(param.type_data.as_str()) { Err(message) => Err(message), @@ -350,22 +374,26 @@ impl X86IntrinsicType { } } - if param.type_data.matches("__mmask").next().is_some() { + if param.type_data.contains("__mmask") { data.bit_len = str::parse::(type_processed.as_str()).ok(); } - // then check the param.type and extract numeric part if there are double - // underscores. divide this number with bit-len and set this as simd-len. - // Only __m types can have a simd-len. - if param.type_data.matches("__m").next().is_some() - && param.type_data.matches("__mmask").next().is_none() - { - data.simd_len = match str::parse::(type_processed.as_str()) { - // If bit_len is None, simd_len will be None. - // Else simd_len will be (num_bits / bit_len). - Ok(num_bits) => data.bit_len.and_then(|bit_len| Some(num_bits / bit_len)), - Err(_) => None, - }; + if vec!["M512", "M256", "M128"].contains(¶m.etype.as_str()) { + match param.type_data.chars().last() { + Some('i') => { + data.kind = TypeKind::Int(Sign::Signed); + data.bit_len = Some(32); + } + Some('h') => { + data.kind = TypeKind::Float; + data.bit_len = Some(16); + } + Some('d') => { + data.kind = TypeKind::Float; + data.bit_len = Some(64); + } + _ => (), + } } // default settings for "void *" parameters @@ -381,22 +409,35 @@ impl X86IntrinsicType { data.bit_len = Some(32); } - // default settings for IMM parameters - if param.etype == "IMM" && param.imm_width > 0 { - data.bit_len = Some(param.imm_width); - } - if param.etype == "IMM" || param.imm_width > 0 || param.imm_type.len() > 0 { + data.kind = TypeKind::Int(Sign::Unsigned); data.constant = true; } - // if param.etype == IMM, then it is a constant. - // else it stays unchanged. 
- data.constant |= param.etype == "IMM"; - Ok(X86IntrinsicType { + // Rust defaults to signed variants, unless they are explicitly mentioned + // the `type` field are C++ types. + if data.kind == TypeKind::Int(Sign::Unsigned) + && !(param.type_data.contains("unsigned") || param.type_data.contains("uint")) + { + data.kind = TypeKind::Int(Sign::Signed) + } + + // default settings for IMM parameters + if param.etype == "IMM" { + data.bit_len = if param.imm_width > 0 { + Some(param.imm_width) + } else { + Some(8) + } + } + + let mut result = X86IntrinsicType { data, param: param.clone(), - }) + }; + + result.update_simd_len(); + Ok(result) } } // Tile types won't currently reach here, since the intrinsic that involve them From 4cba53aac2050aa8549b0a8245e840e97a1ca17c Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 24 Sep 2025 22:25:58 +0530 Subject: [PATCH 38/73] chore: accommodate for `immwidth` field for constraints extras: 1. call update_simd_len() after inferring bit_len for arguments of certain intrinsics 2.
handle the effective bit_len for _mm_mpsadbw_epu8 intrinsic's `imm8` argument which has only 3 bits that are used --- crates/intrinsic-test/src/x86/xml_parser.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 157a37fc9d..90bafbee54 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -84,7 +84,7 @@ fn xml_to_intrinsic( if ty.is_err() { None } else { - let constraint = map_constraints(&param.imm_type); + let constraint = map_constraints(&param.imm_type, param.imm_width); let arg = Argument::::new( i, param.var_name.clone(), @@ -117,11 +117,20 @@ fn xml_to_intrinsic( args[index].ty.bit_len = args[0].ty.bit_len; } + args.iter_mut().for_each(|arg| arg.ty.update_simd_len()); + + if name == "_mm_mpsadbw_epu8" { + args.iter_mut() + .filter(|arg| arg.name.contains("imm8")) + .for_each(|arg| arg.ty.bit_len = Some(3)); + } + let arguments = ArgumentList:: { args }; if let Err(message) = result { return Err(Box::from(message)); } + Ok(Intrinsic { name, arguments, From a42841592285515d7d8ff08f6b4894b790ffb04f Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 25 Sep 2025 00:52:58 +0530 Subject: [PATCH 39/73] feat: defined more load functions that are natively not defined (such as arguments with UI16 etype and __m128d type) --- crates/intrinsic-test/src/x86/config.rs | 143 ++++++++++++++++++++++++ crates/intrinsic-test/src/x86/types.rs | 24 ++++ 2 files changed, 167 insertions(+) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index a199a58ff2..76317db5e4 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -7,6 +7,128 @@ pub const NOTICE: &str = "\ pub const F16_FORMATTING_DEF: &str = r#" use std::arch::x86_64::*; +#[inline] +unsafe fn _mm_loadu_ph_to___m128i(mem_addr: *const f16) -> __m128i { + 
_mm_castph_si128(_mm_loadu_ph(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_ph_to___m256i(mem_addr: *const f16) -> __m256i { + _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_ph_to___mm512i(mem_addr: *const f16) -> __m512i { + _mm512_castph_si512(_mm512_loadu_ph(mem_addr)) +} + + +#[inline] +unsafe fn _mm_loadu_ps_to___m128h(mem_addr: *const f32) -> __m128h { + _mm_castps_ph(_mm_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_ps_to___m256h(mem_addr: *const f32) -> __m256h { + _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_ps_to___m512h(mem_addr: *const f32) -> __m512h { + _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi16_to___m128d(mem_addr: *const i16) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi16_to___m256d(mem_addr: *const i16) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi16_to___m512d(mem_addr: *const i16) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi32_to___m128d(mem_addr: *const i32) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi32_to___m256d(mem_addr: *const i32) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi32_to___m512d(mem_addr: *const i32) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi64_to___m128d(mem_addr: *const i64) -> __m128d { + _mm_castsi128_pd(_mm_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi64_to___m256d(mem_addr: *const i64) -> __m256d { + _mm256_castsi256_pd(_mm256_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi64_to___m512d(mem_addr: *const i64) -> __m512d { + _mm512_castsi512_pd(_mm512_loadu_epi64(mem_addr)) +} + +// === 
+#[inline] +unsafe fn _mm_loadu_epi16_to___m128(mem_addr: *const i16) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi16_to___m256(mem_addr: *const i16) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi16_to___m512(mem_addr: *const i16) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi16(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi32_to___m128(mem_addr: *const i32) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi32_to___m256(mem_addr: *const i32) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi32_to___m512(mem_addr: *const i32) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi32(mem_addr)) +} + +#[inline] +unsafe fn _mm_loadu_epi64_to___m128(mem_addr: *const i64) -> __m128 { + _mm_castsi128_ps(_mm_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm256_loadu_epi64_to___m256(mem_addr: *const i64) -> __m256 { + _mm256_castsi256_ps(_mm256_loadu_epi64(mem_addr)) +} + +#[inline] +unsafe fn _mm512_loadu_epi64_to___m512(mem_addr: *const i64) -> __m512 { + _mm512_castsi512_ps(_mm512_loadu_epi64(mem_addr)) +} + #[inline] fn debug_simd_finish( formatter: &mut core::fmt::Formatter<'_>, @@ -50,6 +172,13 @@ impl DebugHexF16 for __m128h { } } +impl DebugHexF16 for __m128i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 8]>(*self) }; + debug_simd_finish(f, "__m128i", &array) + } +} + impl DebugHexF16 for __m256h { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let array = unsafe { core::mem::transmute::<_, [Hex; 16]>(*self) }; @@ -57,12 +186,26 @@ impl DebugHexF16 for __m256h { } } +impl DebugHexF16 for __m256i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 
16]>(*self) }; + debug_simd_finish(f, "__m256i", &array) + } +} + impl DebugHexF16 for __m512h { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let array = unsafe { core::mem::transmute::<_, [Hex; 32]>(*self) }; debug_simd_finish(f, "__m512h", &array) } } + +impl DebugHexF16 for __m512i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [Hex; 32]>(*self) }; + debug_simd_finish(f, "__m512i", &array) + } +} "#; pub const LANE_FUNCTION_HELPERS: &str = r#" diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index b07726656a..e4b6e12876 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -127,6 +127,30 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { .replace("64", ""); { let suffix = match (self.bit_len, self.kind) { + (Some(16), TypeKind::Float) + if ["__m128i", "__m256i", "__m512i"] + .contains(&self.param.type_data.as_str()) => + { + format!("ph_to_{}", self.param.type_data) + } + (Some(32), TypeKind::Float) + if ["__m128h", "__m256h", "__m512h"] + .contains(&self.param.type_data.as_str()) => + { + format!("ps_to_{}", self.param.type_data) + } + (Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask) + if ["__m128d", "__m256d", "__m512d"] + .contains(&self.param.type_data.as_str()) => + { + format!("epi{bit_len}_to_{}", self.param.type_data) + } + (Some(bit_len @ (16 | 32 | 64)), TypeKind::Int(_) | TypeKind::Mask) + if ["__m128", "__m256", "__m512"] + .contains(&self.param.type_data.as_str()) => + { + format!("epi{bit_len}_to_{}", self.param.type_data) + } (Some(bit_len @ (8 | 16 | 32 | 64)), TypeKind::Int(_)) => { format!("epi{bit_len}") } From c57e9a2c85286c5ad0e2cdeaa7f78b8babb72972 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 25 Sep 2025 00:53:36 +0530 Subject: [PATCH 40/73] chore: corrected the imm-width correction location for 
_mm_mpsadbw_epu8 intrinsic --- crates/intrinsic-test/src/x86/xml_parser.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/crates/intrinsic-test/src/x86/xml_parser.rs b/crates/intrinsic-test/src/x86/xml_parser.rs index 90bafbee54..af85118b8a 100644 --- a/crates/intrinsic-test/src/x86/xml_parser.rs +++ b/crates/intrinsic-test/src/x86/xml_parser.rs @@ -84,7 +84,12 @@ fn xml_to_intrinsic( if ty.is_err() { None } else { - let constraint = map_constraints(&param.imm_type, param.imm_width); + let effective_imm_width = if name == "_mm_mpsadbw_epu8" && param.var_name == "imm8" { + 3 + } else { + param.imm_width + }; + let constraint = map_constraints(&param.imm_type, effective_imm_width); let arg = Argument::::new( i, param.var_name.clone(), @@ -119,12 +124,6 @@ fn xml_to_intrinsic( args.iter_mut().for_each(|arg| arg.ty.update_simd_len()); - if name == "_mm_mpsadbw_epu8" { - args.iter_mut() - .filter(|arg| arg.name.contains("imm8")) - .for_each(|arg| arg.ty.bit_len = Some(3)); - } - let arguments = ArgumentList:: { args }; if let Err(message) = result { From f58777f792bc9ee021872bbd3296bb7bc79c2a9b Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 25 Sep 2025 00:57:01 +0530 Subject: [PATCH 41/73] feat: added exclusion list to intrinsic-test CI pipeline --- ci/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/run.sh b/ci/run.sh index d8af9b7697..dc5a78723b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -93,6 +93,7 @@ case ${TARGET} in TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" TEST_CXX_COMPILER="clang++-19" TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt export STDARCH_DISABLE_ASSERT_INSTR=1 export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" From d9be63f190e7a55979e693ad7f472b882a90b9a7 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 25 Sep 2025 01:01:40 +0530 Subject: [PATCH 42/73] chore: clean up unused variables --- 
crates/intrinsic-test/src/common/intrinsic_helpers.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index 43a0e3f5d1..c52bccb693 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -282,7 +282,7 @@ impl IntrinsicType { } IntrinsicType { kind: TypeKind::Vector, - bit_len: Some(bit_len @ (128 | 256 | 512)), + bit_len: Some(128 | 256 | 512), simd_len, vec_len, .. From d6f7ca80e30e46ef9989288afd056b18cf309d10 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 27 Sep 2025 15:29:54 +0530 Subject: [PATCH 43/73] feat: moved cast to architecture-specific definitions --- crates/intrinsic-test/src/arm/config.rs | 8 ++++++++ crates/intrinsic-test/src/common/gen_c.rs | 18 ------------------ crates/intrinsic-test/src/x86/config.rs | 13 +++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index 46706e009e..daef7b9b0e 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -8,6 +8,14 @@ pub const POLY128_OSTREAM_DECL: &str = r#" std::ostream& operator<<(std::ostream& os, poly128_t value); std::ostream& operator<<(std::ostream& os, float16_t value); #endif + +// T1 is the `To` type, T2 is the `From` type +template T1 cast(T2 x) {{ + static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); + T1 ret{{}}; + memcpy(&ret, &x, sizeof(T1)); + return ret; +}} "#; pub const POLY128_OSTREAM_DEF: &str = r#" diff --git a/crates/intrinsic-test/src/common/gen_c.rs b/crates/intrinsic-test/src/common/gen_c.rs index 965e229da5..04741e4f80 100644 --- a/crates/intrinsic-test/src/common/gen_c.rs +++ b/crates/intrinsic-test/src/common/gen_c.rs @@ -122,24 +122,6 @@ pub fn write_mod_cpp( writeln!(w, "{}", 
forward_declarations)?; - writeln!( - w, - r#" -// T1 is the `To` type, T2 is the `From` type -template T1 cast(T2 x) {{ - if (std::is_convertible::value) {{ - return x; - }} else if (sizeof(T1) == sizeof(T2)) {{ - T1 ret{{}}; - memcpy(&ret, &x, sizeof(T1)); - return ret; - }} else {{ - assert("T2 must either be convertable to T1, or have the same size as T1!"); - }} -}} -"# - )?; - for intrinsic in intrinsics { create_c_test_function(w, intrinsic)?; } diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 76317db5e4..cb4c994066 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -269,6 +269,19 @@ std::ostream& operator<<(std::ostream& os, __m512i value) { return os; } +// T1 is the `To` type, T2 is the `From` type +template T1 cast(T2 x) {{ + if (std::is_convertible::value) {{ + return x; + }} else if (sizeof(T1) == sizeof(T2)) {{ + T1 ret{{}}; + memcpy(&ret, &x, sizeof(T1)); + return ret; + }} else {{ + assert("T2 must either be convertable to T1, or have the same size as T1!"); + }} +}} + #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) From ff776c49750c800d6c20756ba6f8493382f4d00e Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 27 Sep 2025 16:27:13 +0530 Subject: [PATCH 44/73] fix: remove extra brackets for cast definition in arm/config.rs --- crates/intrinsic-test/src/arm/config.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index daef7b9b0e..7421a6da38 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -10,12 +10,12 @@ std::ostream& operator<<(std::ostream& os, float16_t value); #endif // T1 is the `To` type, T2 is the `From` type -template T1 cast(T2 x) {{ +template T1 cast(T2 x) { static_assert(sizeof(T1) == 
sizeof(T2), "sizeof T1 and T2 must be the same"); - T1 ret{{}}; + T1 ret{}; memcpy(&ret, &x, sizeof(T1)); return ret; -}} +} "#; pub const POLY128_OSTREAM_DEF: &str = r#" From 1047f810b75152a15a1bf6a6e1a2b9afbe55790c Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 27 Sep 2025 16:34:08 +0530 Subject: [PATCH 45/73] make `std::ostream& operator<<(std::ostream& os, float16_t value);` definition available for armv7 also --- crates/intrinsic-test/src/arm/config.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index 7421a6da38..354d8f50b4 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -6,9 +6,10 @@ pub const NOTICE: &str = "\ pub const POLY128_OSTREAM_DECL: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value); -std::ostream& operator<<(std::ostream& os, float16_t value); #endif +std::ostream& operator<<(std::ostream& os, float16_t value); + // T1 is the `To` type, T2 is the `From` type template T1 cast(T2 x) { static_assert(sizeof(T1) == sizeof(T2), "sizeof T1 and T2 must be the same"); @@ -33,6 +34,8 @@ std::ostream& operator<<(std::ostream& os, poly128_t value) { return os; } +#endif + std::ostream& operator<<(std::ostream& os, float16_t value) { uint16_t temp = 0; memcpy(&temp, &value, sizeof(float16_t)); @@ -41,7 +44,6 @@ std::ostream& operator<<(std::ostream& os, float16_t value) { os << ss.str(); return os; } -#endif "#; // Format f16 values (and vectors containing them) in a way that is consistent with C. 
From cc28ab03665352f8e1f0680a8968a8a720877550 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 27 Sep 2025 23:36:44 +0530 Subject: [PATCH 46/73] feat: add missing_x86.txt to filter out intrinsics that cannot be tested currently --- crates/intrinsic-test/missing_x86.txt | 874 ++++++++++++++++++++++++++ 1 file changed, 874 insertions(+) create mode 100644 crates/intrinsic-test/missing_x86.txt diff --git a/crates/intrinsic-test/missing_x86.txt b/crates/intrinsic-test/missing_x86.txt new file mode 100644 index 0000000000..824d36f605 --- /dev/null +++ b/crates/intrinsic-test/missing_x86.txt @@ -0,0 +1,874 @@ +# Are defined under a similar name + +#__bswap_64 +_bswap64 + +# Provides pointer to allocated memory, which is difficult to test +_mm_malloc + +# requires target feature 'waitpkg', but would be inlined into function that is compiled without support for 'waitpkg' +_tpause +_umwait + +# `use of undeclared identifier` error in Clang +_bit_scan_forward +_bit_scan_reverse +_bswap +_castf32_u32 +_castf64_u64 +_castu32_f32 +_castu64_f64 +_lrotl +_lrotr +_may_i_use_cpu_feature +_may_i_use_cpu_feature_ext +_mm256_acos_pd +_mm256_acos_ph +_mm256_acos_ps +_mm256_acosh_pd +_mm256_acosh_ph +_mm256_acosh_ps +_mm256_asin_pd +_mm256_asin_ph +_mm256_asin_ps +_mm256_asinh_pd +_mm256_asinh_ph +_mm256_asinh_ps +_mm256_atan_pd +_mm256_atan_ps +_mm256_atan_ph +_mm256_atan2_pd +_mm256_atan2_ph +_mm256_atan2_ps +_mm256_atanh_pd +_mm256_atanh_ph +_mm256_atanh_ps +_mm256_cbrt_pd +_mm256_cbrt_ph +_mm256_cbrt_ps +_mm256_cdfnorm_pd +_mm256_cdfnorm_ph +_mm256_cdfnorm_ps +_mm256_cdfnorminv_pd +_mm256_cdfnorminv_ph +_mm256_cdfnorminv_ps +_mm256_cexp_ps +_mm256_cos_pd +_mm256_cos_ph +_mm256_cos_ps +_mm256_cosd_pd +_mm256_cosd_ph +_mm256_cosd_ps +_mm256_cosh_pd +_mm256_cosh_ph +_mm256_cosh_ps +_mm256_csqrt_ps +_mm256_div_epi16 +_mm256_div_epi32 +_mm256_div_epi64 +_mm256_div_epi8 +_mm256_div_epu16 +_mm256_div_epu32 +_mm256_div_epu64 +_mm256_div_epu8 +_mm256_dpbssd_epi32 
+_mm256_dpbssds_epi32 +_mm256_dpbsud_epi32 +_mm256_dpbsuds_epi32 +_mm256_dpbuud_epi32 +_mm256_dpbuuds_epi32 +_mm256_dpwsud_epi32 +_mm256_dpwsuds_epi32 +_mm256_dpwusd_epi32 +_mm256_dpwusds_epi32 +_mm256_dpwuud_epi32 +_mm256_dpwuuds_epi32 +_mm256_erf_pd +_mm256_erf_ps +_mm256_erfc_pd +_mm256_erfc_ph +_mm256_erfc_ps +_mm256_erfcinv_pd +_mm256_erfcinv_ph +_mm256_erfcinv_ps +_mm256_erfinv_pd +_mm256_erfinv_ph +_mm256_erfinv_ps +_mm256_exp10_pd +_mm256_exp10_ph +_mm256_exp10_ps +_mm256_exp2_pd +_mm256_exp2_ph +_mm256_exp2_ps +_mm256_exp_pd +_mm256_exp_ph +_mm256_exp_ps +_mm256_expm1_pd +_mm256_expm1_ph +_mm256_expm1_ps +_mm256_hypot_pd +_mm256_hypot_ph +_mm256_hypot_ps +_mm256_idiv_epi32 +_mm256_invcbrt_pd +_mm256_invcbrt_ph +_mm256_invcbrt_ps +_mm256_invsqrt_pd +_mm256_invsqrt_ph +_mm256_invsqrt_ps +_mm256_irem_epi32 +_mm256_log10_pd +_mm256_log10_ph +_mm256_log10_ps +_mm256_log1p_pd +_mm256_log1p_ph +_mm256_log1p_ps +_mm256_log2_pd +_mm256_log2_ph +_mm256_log2_ps +_mm256_log_pd +_mm256_log_ph +_mm256_log_ps +_mm256_logb_pd +_mm256_logb_ph +_mm256_logb_ps +_mm256_clog_ps +_mm256_madd52hi_avx_epu64 +_mm256_madd52lo_avx_epu64 +_mm256_erf_ph +_mm256_mask_reduce_add_epi16 +_mm256_mask_reduce_add_epi8 +_mm256_mask_reduce_and_epi16 +_mm256_mask_reduce_and_epi8 +_mm256_mask_reduce_max_epi16 +_mm256_mask_reduce_max_epi8 +_mm256_mask_reduce_max_epu16 +_mm256_mask_reduce_max_epu8 +_mm256_mask_reduce_min_epi16 +_mm256_mask_reduce_min_epi8 +_mm256_mask_reduce_min_epu16 +_mm256_mask_reduce_min_epu8 +_mm256_mask_reduce_mul_epi16 +_mm256_mask_reduce_mul_epi8 +_mm256_mask_reduce_or_epi16 +_mm256_mask_reduce_or_epi8 +_mm512_cosd_ph +_mm512_cosd_ps +_mm512_cosh_pd +_mm512_cosh_ph +_mm512_cosh_ps +_mm512_div_epi16 +_mm512_div_epi32 +_mm512_div_epi64 +_mm512_div_epi8 +_mm512_div_epu16 +_mm512_div_epu32 +_mm512_div_epu64 +_mm512_div_epu8 +_mm512_erf_pd +_mm512_erf_ph +_mm512_erf_ps +_mm512_erfc_pd +_mm512_erfc_ph +_mm512_erfc_ps +_mm512_erfcinv_pd +_mm512_erfcinv_ph +_mm512_erfcinv_ps 
+_mm512_erfinv_pd +_mm512_erfinv_ph +_mm512_erfinv_ps +_mm512_exp10_pd +_mm512_exp10_ph +_mm512_exp10_ps +_mm512_exp2_pd +_mm512_exp2_ph +_mm512_exp2_ps +_mm512_exp_pd +_mm512_exp_ph +_mm512_exp_ps +_mm512_expm1_pd +_mm512_expm1_ph +_mm512_expm1_ps +_mm512_floor_ph +_mm512_hypot_pd +_mm512_hypot_ph +_mm512_hypot_ps +_mm512_invsqrt_pd +_mm512_invsqrt_ph +_mm512_invsqrt_ps +_mm512_log10_pd +_mm512_log10_ph +_mm512_log10_ps +_mm512_log1p_pd +_mm512_log1p_ph +_mm512_log1p_ps +_mm512_log2_pd +_mm512_log2_ph +_mm512_log2_ps +_mm512_log_pd +_mm512_log_ph +_mm512_log_ps +_mm512_logb_pd +_mm512_logb_ph +_mm512_logb_ps +_mm512_mask_acos_pd +_mm512_mask_acos_ph +_mm512_mask_acos_ps +_mm512_mask_acosh_pd +_mm512_mask_acosh_ph +_mm512_mask_acosh_ps +_mm512_mask_asin_pd +_mm512_mask_asin_ph +_mm512_mask_asin_ps +_mm512_mask_asinh_pd +_mm512_mask_asinh_ph +_mm512_mask_asinh_ps +_mm512_mask_atan2_pd +_mm512_mask_atan2_ps +_mm512_mask_atan_pd +_mm512_mask_atan_ph +_mm512_mask_atan_ph +_mm512_mask_atanh_pd +_mm512_mask_atanh_ph +_mm512_mask_atanh_ps +_mm512_mask_cbrt_pd +_mm512_mask_cbrt_ph +_mm512_mask_cbrt_ps +_mm512_mask_cdfnorm_pd +_mm512_mask_cdfnorm_ph +_mm512_mask_cdfnorm_ps +_mm512_mask_cdfnorminv_pd +_mm512_mask_cdfnorminv_ph +_mm512_mask_cdfnorminv_ps +_mm512_mask_ceil_ph +_mm512_mask_cos_pd +_mm512_mask_cos_ph +_mm512_mask_cos_ps +_mm512_mask_cosd_pd +_mm512_mask_cosd_ph +_mm512_mask_cosd_ps +_mm512_mask_cosh_pd +_mm512_mask_cosh_ph +_mm512_mask_cosh_ps +_mm512_mask_atan_ps +_mm512_cosd_pd +_mm512_cos_ps +_mm512_cos_ph +_mm512_cos_pd +_mm512_mask_div_epi32 +_mm512_mask_div_epu32 +_mm512_mask_erf_pd +_mm512_mask_erf_ph +_mm512_mask_erf_ps +_mm512_mask_erfc_pd +_mm512_mask_erfc_ph +_mm512_mask_erfc_ps +_mm512_mask_erfcinv_pd +_mm512_mask_erfcinv_ph +_mm512_mask_erfcinv_ps +_mm512_mask_erfinv_pd +_mm512_mask_erfinv_ph +_mm512_mask_erfinv_ps +_mm512_mask_exp10_pd +_mm512_mask_exp10_ph +_mm512_mask_exp10_ps +_mm512_mask_exp2_pd +_mm512_mask_exp2_ph +_mm512_mask_exp2_ps 
+_mm512_mask_exp_pd +_mm512_mask_exp_ph +_mm512_mask_exp_ps +_mm512_mask_expm1_pd +_mm512_mask_expm1_ph +_mm512_mask_expm1_ps +_mm512_mask_floor_ph +_mm512_mask_hypot_pd +_mm512_mask_hypot_ps +_mm512_mask_invsqrt_pd +_mm512_mask_invsqrt_ph +_mm512_mask_invsqrt_ps +_mm512_mask_log10_pd +_mm512_mask_log10_ph +_mm512_mask_log10_ps +_mm512_mask_log1p_pd +_mm512_mask_log1p_ph +_mm512_mask_log1p_ps +_mm512_mask_log2_pd +_mm512_mask_log2_ph +_mm512_mask_log2_ps +_mm512_mask_log_pd +_mm512_mask_log_ph +_mm512_mask_log_ps +_mm512_mask_logb_pd +_mm512_mask_logb_ph +_mm512_mask_logb_ps +_mm512_mask_nearbyint_pd +_mm512_mask_nearbyint_ph +_mm512_mask_nearbyint_ps +_mm512_mask_pow_pd +_mm512_mask_pow_ps +_mm512_mask_recip_pd +_mm512_mask_recip_ph +_mm512_mask_recip_ps +_mm512_mask_rem_epi32 +_mm512_mask_rem_epu32 +_mm512_mask_rint_pd +_mm512_mask_rint_ph +_mm512_mask_rint_ps +_mm512_mask_sin_pd +_mm512_mask_sin_ph +_mm512_mask_sin_ps +_mm512_mask_sind_pd +_mm512_mask_sind_ph +_mm512_mask_sind_ps +_mm512_mask_sinh_pd +_mm512_mask_sinh_ph +_mm512_mask_sinh_ps +_mm512_mask_svml_round_pd +_mm512_mask_svml_round_ph +_mm512_mask_tan_pd +_mm512_mask_tan_ph +_mm512_mask_tan_ps +_mm512_mask_tand_pd +_mm512_mask_tand_ph +_mm512_mask_tand_ps +_mm512_mask_tanh_pd +_mm512_mask_tanh_ph +_mm512_mask_tanh_ps +_mm512_mask_trunc_pd +_mm512_mask_trunc_ph +_mm512_mask_trunc_ps +_mm512_nearbyint_pd +_mm512_nearbyint_ph +_mm512_nearbyint_ps +_mm512_pow_pd +_mm512_pow_ph +_mm512_pow_ps +_mm512_recip_pd +_mm512_recip_ph +_mm512_recip_ps +_mm512_rem_epi16 +_mm512_rem_epi32 +_mm512_rem_epi64 +_mm512_rem_epi8 +_mm512_rem_epu16 +_mm512_rem_epu32 +_mm512_rem_epu64 +_mm512_rem_epu8 +_mm512_rint_pd +_mm512_rint_ph +_mm512_rint_ps +_mm512_sin_pd +_mm512_sin_ph +_mm512_sin_ps +_mm512_sind_pd +_mm512_sind_ph +_mm512_sind_ps +_mm512_sinh_pd +_mm512_sinh_ph +_mm512_sinh_ps +_mm512_svml_round_pd +_mm512_svml_round_ph +_mm512_tan_pd +_mm512_tan_ph +_mm512_tan_ps +_mm512_tand_pd +_mm512_tand_ph +_mm512_tand_ps 
+_mm512_tanh_pd +_mm512_tanh_ph +_mm512_tanh_ps +_mm512_trunc_pd +_mm512_trunc_ph +_mm512_trunc_ps +_mm_acos_pd +_mm_acos_ph +_mm_acos_ps +_mm_acosh_pd +_mm_acosh_ph +_mm_acosh_ps +_mm_asin_pd +_mm_asin_ph +_mm_asin_ps +_mm_asinh_pd +_mm_asinh_ph +_mm_asinh_ps +_mm_atan2_pd +_mm_atan2_ph +_mm_atan2_ps +_mm_atan_pd +_mm_atan_ph +_mm_atan_ps +_mm_atanh_pd +_mm_atanh_ph +_mm_atanh_ps +_mm_cbrt_pd +_mm_cbrt_ph +_mm_cbrt_ps +_mm_cdfnorm_pd +_mm_cdfnorm_ph +_mm_cdfnorm_ps +_mm_cdfnorminv_pd +_mm_cdfnorminv_ph +_mm_cdfnorminv_ps +_mm_cexp_ps +_mm_clog_ps +_mm_cos_pd +_mm_cos_ph +_mm_cos_ps +_mm_cosd_pd +_mm_cosd_ph +_mm_cosd_ps +_mm_cosh_pd +_mm_cosh_ph +_mm_cosh_ps +_mm_csqrt_ps +_mm_cvtsd_si64x +_mm_cvtsi128_si64x +_mm_cvtsi64x_sd +_mm_cvtsi64x_si128 +_mm_cvttsd_si64x +_mm_div_epi16 +_mm_div_epi32 +_mm_div_epi64 +_mm_div_epi8 +_mm_div_epu16 +_mm_div_epu32 +_mm_div_epu64 +_mm_div_epu8 +_mm_dpbssd_epi32 +_mm_dpbssds_epi32 +_mm_dpbsud_epi32 +_mm_dpbsuds_epi32 +_mm_dpbuud_epi32 +_mm_dpbuuds_epi32 +_mm_dpwsud_epi32 +_mm_dpwsuds_epi32 +_mm_dpwusd_epi32 +_mm_dpwusds_epi32 +_mm_dpwuud_epi32 +_mm_dpwuuds_epi32 +_mm_erf_pd +_mm_erf_ph +_mm_erf_ps +_mm_erfc_pd +_mm_erfc_ph +_mm_erfc_ps +_mm_erfcinv_pd +_mm_erfcinv_ph +_mm_erfcinv_ps +_mm_erfinv_pd +_mm_erfinv_ph +_mm_erfinv_ps +_mm_exp10_pd +_mm_exp10_ph +_mm_exp10_ps +_mm_exp2_pd +_mm_exp2_ph +_mm_exp2_ps +_mm_exp_pd +_mm_exp_ph +_mm_exp_ps +_mm_expm1_pd +_mm_expm1_ph +_mm_expm1_ps +_mm_hypot_pd +_mm_hypot_ph +_mm_hypot_ps +_mm_idiv_epi32 +_mm_invcbrt_pd +_mm_invcbrt_ph +_mm_invcbrt_ps +_mm_invsqrt_pd +_mm_invsqrt_ph +_mm_invsqrt_ps +_mm_irem_epi32 +_mm_log10_pd +_mm_log10_ph +_mm_log10_ps +_mm_log1p_pd +_mm_log1p_ph +_mm_log1p_ps +_mm_log2_pd +_mm_log2_ph +_mm_log2_ps +_mm_log_pd +_mm_log_ph +_mm_log_ps +_mm_logb_pd +_mm_logb_ph +_mm_logb_ps +_mm_madd52hi_avx_epu64 +_mm_madd52lo_avx_epu64 +_mm_mask_reduce_add_epi16 +_mm_mask_reduce_add_epi8 +_mm_mask_reduce_and_epi16 +_mm_mask_reduce_and_epi8 +_mm_mask_reduce_max_epi16 
+_mm_mask_reduce_max_epi8 +_mm_mask_reduce_max_epu16 +_mm_mask_reduce_max_epu8 +_mm_mask_reduce_min_epi16 +_mm_mask_reduce_min_epi8 +_mm_mask_reduce_min_epu16 +_mm_mask_reduce_min_epu8 +_mm_mask_reduce_mul_epi16 +_mm_mask_reduce_mul_epi8 +_mm_mask_reduce_or_epi16 +_mm_mask_reduce_or_epi8 +_mm_pow_pd +_mm_pow_ph +_mm_pow_ps +_mm_reduce_add_epi16 +_mm_reduce_add_epi8 +_mm_reduce_and_epi16 +_mm_reduce_and_epi8 +_mm_reduce_max_epi16 +_mm_reduce_max_epi8 +_mm_reduce_max_epu16 +_mm_reduce_max_epu8 +_mm_reduce_min_epi16 +_mm_reduce_min_epi8 +_mm_reduce_min_epu16 +_mm_reduce_min_epu8 +_mm_reduce_mul_epi16 +_mm_reduce_mul_epi8 +_mm_reduce_or_epi16 +_mm_reduce_or_epi8 +_mm_rem_epi16 +_mm_rem_epi32 +_mm_rem_epi64 +_mm_rem_epi8 +_mm_rem_epu16 +_mm_rem_epu32 +_mm_rem_epu64 +_mm_rem_epu8 +_mm_sin_pd +_mm_sin_ph +_mm_sin_ps +_mm_sind_pd +_mm_sind_ph +_mm_sind_ps +_mm_sinh_pd +_mm_sinh_ph +_mm_sinh_ps +_mm_sm3msg1_epi32 +_mm_sm3msg2_epi32 +_mm_sm3rnds2_epi32 +_mm_sm4key4_epi32 +_mm_sm4rnds4_epi32 +_mm_svml_ceil_pd +_mm_svml_ceil_ph +_mm_svml_ceil_ps +_mm_svml_floor_pd +_mm_svml_floor_ph +_mm_svml_floor_ps +_mm_svml_round_pd +_mm_svml_round_ph +_mm_svml_round_ps +_mm_svml_sqrt_pd +_mm_svml_sqrt_ph +_mm_svml_sqrt_ps +_mm_tan_pd +_mm_tan_ph +_mm_tan_ps +_mm_tand_pd +_mm_tand_ph +_mm_tand_ps +_mm_tanh_pd +_mm_tanh_ph +_mm_tanh_ps +_mm_trunc_pd +_mm_trunc_ph +_mm_trunc_ps +_mm_udiv_epi32 +_mm_urem_epi32 +_popcnt32 +_popcnt64 +_rdpmc +_rotl +_rotl64 +_rotr +_rotr64 +_rotwl +_rotwr +_urdmsr + +# Cannot find value in this scope (in Rust testfiles) +_mm512_set1_pch +_mm_abs_pi16 +_mm_abs_pi32 +_mm_abs_pi8 +_mm_add_pi16 +_mm_add_pi32 +_mm_add_pi8 +_mm_add_si64 +_mm_adds_pi16 +_mm_adds_pi8 +_mm_adds_pu16 +_mm_adds_pu8 +_mm_alignr_pi8 +_mm_and_si64 +_mm_andnot_si64 +_mm_avg_pu16 +_mm_avg_pu8 +_mm_cmpeq_pi16 +_mm_cmpeq_pi32 +_mm_cmpeq_pi8 +_mm_cmpgt_pi16 +_mm_cmpgt_pi32 +_mm_cmpgt_pi8 +_mm_cvt_pi2ps +_mm_cvt_ps2pi +_mm_cvtm64_si64 +_mm_cvtpd_pi32 +_mm_cvtpi16_ps +_mm_cvtpi32_pd +_mm_cvtpi32_ps 
+_mm_cvtpi32x2_ps +_mm_cvtpi8_ps +_mm_cvtps_pi16 +_mm_cvtps_pi32 +_mm_cvtps_pi8 +_mm_cvtpu16_ps +_mm_cvtpu8_ps +_mm_cvtsi32_si64 +_mm_cvtsi64_m64 +_mm_cvtsi64_si32 +_mm_cvtt_ps2pi +_mm_cvttpd_pi32 +_mm512_cbrt_pd +_mm512_cbrt_ph +_mm512_cbrt_ps +_mm512_cdfnorm_pd +_mm512_cdfnorm_ph +_mm512_cdfnorm_ps +_mm512_cdfnorminv_pd +_mm512_cdfnorminv_ph +_mm512_cdfnorminv_ps +_mm512_ceil_pd +_mm512_ceil_ph +_mm512_ceil_ps +_mm512_floor_pd +_mm512_floor_ps +_mm512_mask_ceil_pd +_mm512_mask_ceil_ps +_mm_max_pi16 +_mm_max_pu8 +_mm_min_pi16 +_mm_min_pu8 +_mm_movemask_pi8 +_mm_movepi64_pi64 +_mm_movpi64_epi64 +_mm_mul_su32 +_mm_mulhi_pi16 +_mm_mulhi_pu16 +_mm_mulhrs_pi16 +_mm_mullo_pi16 +_mm_or_si64 +_mm_packs_pi16 +_mm_packs_pi32 +_mm_packs_pu16 +_mm_popcnt_u32 +_mm_popcnt_u64 +_mm_sad_pu8 +_mm_set1_epi64 +_mm_set1_pch +_mm_set1_pi16 +_mm_set1_pi32 +_mm_set1_pi8 +_mm_set_epi64 +_mm_set_pi16 +_mm_set_pi32 +_mm_set_pi8 +_mm_setr_epi64 +_mm_setr_pi16 +_mm_setr_pi32 +_mm_setr_pi8 +_mm_shuffle_pi16 +_mm_shuffle_pi8 +_mm_sign_pi16 +_mm_sign_pi32 +_mm_sign_pi8 +_mm_sll_pi16 +_mm_sll_pi32 +_mm_sll_si64 +_mm_slli_pi16 +_mm_slli_pi32 +_mm_slli_si64 +_mm_sra_pi16 +_mm_sra_pi32 +_mm_srai_pi16 +_mm_srai_pi32 +_mm_srl_pi16 +_mm_srl_pi32 +_mm_srl_si64 +_mm_srli_pi16 +_mm_srli_pi32 +_mm_srli_si64 +_mm_sub_pi16 +_mm_sub_pi32 +_mm_sub_pi8 +_mm_sub_si64 +_mm_subs_pi16 +_mm_subs_pi8 +_mm_subs_pu16 +_mm_subs_pu8 +_mm_unpackhi_pi16 +_mm_unpackhi_pi32 +_mm_unpackhi_pi8 +_mm_unpacklo_pi16 +_mm_unpacklo_pi32 +_mm_unpacklo_pi8 +_mm_xor_si64 +_mm256_pow_pd +_mm256_pow_ph +_mm256_pow_ps +_mm256_rem_epi16 +_mm256_rem_epi32 +_mm256_rem_epi64 +_mm256_rem_epi8 +_mm256_rem_epu16 +_mm256_rem_epu32 +_mm256_rem_epu64 +_mm256_rem_epu8 +_mm256_set1_pch +_mm256_sin_pd +_mm256_sin_ph +_mm256_sin_ps +_mm256_sind_pd +_mm256_sind_ph +_mm256_sind_ps +_mm256_sinh_pd +_mm256_sinh_ph +_mm256_sinh_ps +_mm256_svml_ceil_pd +_mm256_svml_ceil_ph +_mm256_svml_ceil_ps +_mm256_svml_floor_pd +_mm256_svml_floor_ph 
+_mm256_svml_floor_ps +_mm256_svml_round_pd +_mm256_svml_round_ph +_mm256_svml_round_ps +_mm256_svml_sqrt_pd +_mm256_svml_sqrt_ph +_mm256_svml_sqrt_ps +_mm256_tan_pd +_mm256_tan_ph +_mm256_tan_ps +_mm256_tand_pd +_mm256_tand_ph +_mm256_tand_ps +_mm256_tanh_pd +_mm256_tanh_ph +_mm256_tanh_ps +_mm256_trunc_pd +_mm256_trunc_ph +_mm256_trunc_ps +_mm256_udiv_epi32 +_mm256_urem_epi32 +_mm512_acos_pd +_mm512_acos_ph +_mm512_acos_ps +_mm512_acosh_pd +_mm512_acosh_ph +_mm512_acosh_ps +_mm_cvttps_pi32 +_mm_extract_pi16 +_mm_hadd_pi16 +_mm_hadd_pi32 +_mm_hadds_pi16 +_mm_hsub_pi16 +_mm_hsub_pi32 +_mm_hsubs_pi16 +_mm_insert_pi16 +_mm_madd_pi16 +_mm_maddubs_pi16 +_mm512_asin_pd +_mm512_asin_ph +_mm512_asin_ps +_mm512_asinh_pd +_mm512_asinh_ph +_mm512_asinh_ps +_mm512_atan2_pd +_mm512_atan2_ph +_mm512_atan2_ps +_mm512_atan_pd +_mm512_atan_ph +_mm512_atan_ps +_mm512_atanh_pd +_mm512_atanh_ph +_mm512_atanh_ps +_cvtsh_ss +_cvtss_sh +_m_from_int +_m_from_int64 +_m_packssdw +_m_packsswb +_m_packuswb +_m_paddb +_m_paddd +_m_paddsb +_m_paddsw +_m_paddusb +_m_paddusw +_m_paddw +_m_pand +_m_pandn +_m_pavgb +_m_pavgw +_m_pcmpeqb +_m_pcmpeqd +_m_pcmpeqw +_m_pcmpgtb +_m_pcmpgtd +_m_pcmpgtw +_m_pextrw +_m_pinsrw +_m_pmaddwd +_m_pmaxsw +_m_pmaxub +_m_pminsw +_m_pminub +_m_pmovmskb +_m_pmulhuw +_m_pmulhw +_m_pmullw +_m_por +_m_psadbw +_m_pshufw +_m_pslld +_m_pslldi +_m_psllq +_m_psllqi +_m_psllw +_m_psllwi +_m_psrad +_m_psradi +_m_psraw +_m_psrawi +_m_psrld +_m_psrldi +_m_psrlq +_m_psrlqi +_m_psrlw +_m_psrlwi +_m_psubb +_m_psubd +_m_psubsb +_m_psubsw +_m_psubusb +_m_psubusw +_m_psubw +_m_punpckhbw +_m_punpckhdq +_m_punpckhwd +_m_punpcklbw +_m_punpckldq +_m_punpcklwd +_m_pxor +_m_to_int +_m_to_int64 +_mm512_mask_floor_pd +_mm512_mask_floor_ps \ No newline at end of file From 153191f1b99232f41933a18c366103440fc68759 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 28 Sep 2025 01:37:09 +0530 Subject: [PATCH 47/73] feat: added custom helper functions (that helped load intrinsic 
arguments in Rust) to C++ testfiles. Also added extra compilation flags --- crates/intrinsic-test/src/x86/compile.rs | 2 + crates/intrinsic-test/src/x86/config.rs | 51 ++++++++++++++++++++---- 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 6eaab86150..c0d2b12d0e 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -24,6 +24,8 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { "-mavx512dq", "-mavx512cd", "-mavx512fp16", + "-msha512", + "-msm4", "-ferror-limit=1000", ]); diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index cb4c994066..778448a3bd 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -270,17 +270,17 @@ std::ostream& operator<<(std::ostream& os, __m512i value) { } // T1 is the `To` type, T2 is the `From` type -template T1 cast(T2 x) {{ - if (std::is_convertible::value) {{ +template T1 cast(T2 x) { + if (std::is_convertible::value) { return x; - }} else if (sizeof(T1) == sizeof(T2)) {{ - T1 ret{{}}; + } else if (sizeof(T1) == sizeof(T2)) { + T1 ret{}; memcpy(&ret, &x, sizeof(T1)); return ret; - }} else {{ + } else { assert("T2 must either be convertable to T1, or have the same size as T1!"); - }} -}} + } +} #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) @@ -299,6 +299,43 @@ template T1 cast(T2 x) {{ #define _mm64_extract_intrinsic_test_epi32(m, lane) \ _mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32)) + +// Load f16 (__m128h) and cast to integer (__m128i) +#define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr)) +#define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) +#define _mm512_loadu_ph_to___m512i(mem_addr) 
_mm512_castph_si512(_mm512_loadu_ph(mem_addr)) + +// Load f32 (__m128) and cast to f16 (__m128h) +#define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr)) +#define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) +#define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) + +// Load integer types and cast to double (__m128d, __m256d, __m512d) +#define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define _mm256_loadu_epi16_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + +#define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + +#define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + +// Load integer types and cast to float (__m128, __m256, __m512) +#define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + +#define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define 
_mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + +#define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) +#define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) +#define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + "#; pub const X86_CONFIGURATIONS: &str = r#" From 527adddff3abbcd2568f57cc72ac4c85d603936a Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 28 Sep 2025 13:24:06 +0530 Subject: [PATCH 48/73] chore: add more compiler flags for compiling x86 intrinsics in C++ --- crates/intrinsic-test/src/x86/compile.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index c0d2b12d0e..27fd5d831c 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -26,7 +26,15 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { "-mavx512fp16", "-msha512", "-msm4", + "-mavxvnni", + "-mavx512bitalg", + "-mavx512ifma", + "-mavx512vbmi", + "-mavx512vbmi2", + "-mavx512vnni", + "-mavx512vpopcntdq", "-ferror-limit=1000", + "-std=c++17", ]); if !cpp_compiler.contains("clang") { From 525249f47d8a69fffc3f1b1e070b686ccc1163f9 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 29 Sep 2025 00:13:53 +0530 Subject: [PATCH 49/73] chore: add verbose cli option to C++ compiler --- crates/intrinsic-test/src/common/compile_c.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/common/compile_c.rs b/crates/intrinsic-test/src/common/compile_c.rs index 258e418165..fa78b332a7 100644 --- a/crates/intrinsic-test/src/common/compile_c.rs +++ 
b/crates/intrinsic-test/src/common/compile_c.rs @@ -119,7 +119,7 @@ impl CppCompilation { output: &str, ) -> std::io::Result { let mut cmd = clone_command(&self.0); - cmd.args([input, "-c", "-o", output]); + cmd.args([input, "-v", "-c", "-o", output]); cmd.output() } From d9f8159f5a3e8cf25e8b833b426e0b63e1802df2 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 29 Sep 2025 01:23:15 +0530 Subject: [PATCH 50/73] feat: add clang to dockerfile and change clang++-19 to clang++ --- ci/docker/x86_64-unknown-linux-gnu/Dockerfile | 4 +++- ci/run.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile index bbebe2d7fa..a35136a9c1 100644 --- a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile +++ b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -6,7 +6,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ make \ ca-certificates \ wget \ - xz-utils + xz-utils \ + clang \ + lld RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz RUN mkdir intel-sde diff --git a/ci/run.sh b/ci/run.sh index dc5a78723b..c8dc6a2e8b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -91,7 +91,7 @@ fi case ${TARGET} in x86_64-unknown-linux-gnu) TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" - TEST_CXX_COMPILER="clang++-19" + TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt export STDARCH_DISABLE_ASSERT_INSTR=1 From 3f3e3c4fb07822964a15ad3abbcc55c39696b924 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 29 Sep 2025 01:45:13 +0530 Subject: [PATCH 51/73] fix: add `libstdc++-dev` to fix `iostream not found` error --- ci/docker/x86_64-unknown-linux-gnu/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile 
b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile index a35136a9c1..2743896375 100644 --- a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile +++ b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -8,6 +8,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ wget \ xz-utils \ clang \ + libstdc++-14-dev \ + build-essential \ lld RUN wget http://ci-mirrors.rust-lang.org/stdarch/sde-external-9.58.0-2025-06-16-lin.tar.xz -O sde.tar.xz From 72750f7bce65c6bf1e49a84591cb420f37196b12 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 29 Sep 2025 16:22:37 +0530 Subject: [PATCH 52/73] fix: making compilation step run one by one to prevent the process from being killed. Also separated declarations and definitions for C++ testfiles. --- crates/intrinsic-test/src/common/mod.rs | 12 +- crates/intrinsic-test/src/x86/config.rs | 165 ++++++++++++------------ crates/intrinsic-test/src/x86/mod.rs | 4 +- 3 files changed, 99 insertions(+), 82 deletions(-) diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 37a48654e4..f38f0e5a7c 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -72,19 +72,29 @@ pub trait SupportedArchitectureTest { return Err(format!("Error writing to mod_{i}.cpp: {error:?}")); } + println!("Finished writing mod_{i}.cpp"); + + Ok(()) + }) + .collect::>() + .unwrap(); + + (0..chunk_count) + .map(|i| { // compile this cpp file into a .o file. 
// // This is done because `cpp_compiler_wrapped` is None when // the --generate-only flag is passed + println!("compiling mod_{i}.cpp"); if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { let compile_output = cpp_compiler .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")); + println!("finished compiling mod_{i}.cpp"); if let Err(compile_error) = compile_output { return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}")); } } - Ok(()) }) .collect::>() diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 778448a3bd..159851c104 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -208,18 +208,93 @@ impl DebugHexF16 for __m512i { } "#; -pub const LANE_FUNCTION_HELPERS: &str = r#" -typedef _Float16 float16_t; -typedef float float32_t; -typedef double float64_t; - -#define __int64 long long -#define __int32 int +pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" +#ifndef X86_DECLARATIONS +#define X86_DECLARATIONS + typedef _Float16 float16_t; + typedef float float32_t; + typedef double float64_t; + + #define __int64 long long + #define __int32 int -std::ostream& operator<<(std::ostream& os, _Float16 value); -std::ostream& operator<<(std::ostream& os, __m128i value); -std::ostream& operator<<(std::ostream& os, __m256i value); -std::ostream& operator<<(std::ostream& os, __m512i value); + std::ostream& operator<<(std::ostream& os, _Float16 value); + std::ostream& operator<<(std::ostream& os, __m128i value); + std::ostream& operator<<(std::ostream& os, __m256i value); + std::ostream& operator<<(std::ostream& os, __m512i value); + + #define _mm512_extract_intrinsic_test_epi8(m, lane) \ + _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) + + #define _mm512_extract_intrinsic_test_epi16(m, lane) \ + _mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8) + + #define 
_mm512_extract_intrinsic_test_epi32(m, lane) \ + _mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4) + + #define _mm512_extract_intrinsic_test_epi64(m, lane) \ + _mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2) + + #define _mm64_extract_intrinsic_test_epi8(m, lane) \ + ((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF) + + #define _mm64_extract_intrinsic_test_epi32(m, lane) \ + _mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32)) + + // Load f16 (__m128h) and cast to integer (__m128i) + #define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr)) + #define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) + #define _mm512_loadu_ph_to___m512i(mem_addr) _mm512_castph_si512(_mm512_loadu_ph(mem_addr)) + + // Load f32 (__m128) and cast to f16 (__m128h) + #define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr)) + #define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) + #define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) + + // Load integer types and cast to double (__m128d, __m256d, __m512d) + #define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi16_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i 
const*)(mem_addr))) + #define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + // Load integer types and cast to float (__m128, __m256, __m512) + #define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + #define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) + #define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) + #define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) + + + // T1 is the `To` type, T2 is the `From` type + template T1 cast(T2 x) { + if constexpr (std::is_convertible_v) { + return x; + } else if constexpr (sizeof(T1) == sizeof(T2)) { + T1 ret{}; + std::memcpy(&ret, &x, sizeof(T1)); + return ret; + } else { + static_assert(sizeof(T1) == sizeof(T2) || std::is_convertible_v, + "T2 must either be convertible to T1, or have the same size as T1!"); + return T1{}; + } + } +#endif +"#; +pub const PLATFORM_C_DEFINITIONS: &str = r#" std::ostream& operator<<(std::ostream& os, _Float16 value) { uint16_t temp = 0; @@ -268,74 +343,6 @@ std::ostream& operator<<(std::ostream& os, __m512i value) { os << 
ss.str(); return os; } - -// T1 is the `To` type, T2 is the `From` type -template T1 cast(T2 x) { - if (std::is_convertible::value) { - return x; - } else if (sizeof(T1) == sizeof(T2)) { - T1 ret{}; - memcpy(&ret, &x, sizeof(T1)); - return ret; - } else { - assert("T2 must either be convertable to T1, or have the same size as T1!"); - } -} - -#define _mm512_extract_intrinsic_test_epi8(m, lane) \ - _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), (lane) % 16) - -#define _mm512_extract_intrinsic_test_epi16(m, lane) \ - _mm_extract_epi16(_mm512_extracti64x2_epi64((m), (lane) / 8), (lane) % 8) - -#define _mm512_extract_intrinsic_test_epi32(m, lane) \ - _mm_extract_epi32(_mm512_extracti64x2_epi64((m), (lane) / 4), (lane) % 4) - -#define _mm512_extract_intrinsic_test_epi64(m, lane) \ - _mm_extract_epi64(_mm512_extracti64x2_epi64((m), (lane) / 2), (lane) % 2) - -#define _mm64_extract_intrinsic_test_epi8(m, lane) \ - ((_mm_extract_pi16((m), (lane) / 2) >> (((lane) % 2) * 8)) & 0xFF) - -#define _mm64_extract_intrinsic_test_epi32(m, lane) \ - _mm_cvtsi64_si32(_mm_srli_si64(m, (lane) * 32)) - -// Load f16 (__m128h) and cast to integer (__m128i) -#define _mm_loadu_ph_to___m128i(mem_addr) _mm_castph_si128(_mm_loadu_ph(mem_addr)) -#define _mm256_loadu_ph_to___m256i(mem_addr) _mm256_castph_si256(_mm256_loadu_ph(mem_addr)) -#define _mm512_loadu_ph_to___m512i(mem_addr) _mm512_castph_si512(_mm512_loadu_ph(mem_addr)) - -// Load f32 (__m128) and cast to f16 (__m128h) -#define _mm_loadu_ps_to___m128h(mem_addr) _mm_castps_ph(_mm_loadu_ps(mem_addr)) -#define _mm256_loadu_ps_to___m256h(mem_addr) _mm256_castps_ph(_mm256_loadu_ps(mem_addr)) -#define _mm512_loadu_ps_to___m512h(mem_addr) _mm512_castps_ph(_mm512_loadu_ps(mem_addr)) - -// Load integer types and cast to double (__m128d, __m256d, __m512d) -#define _mm_loadu_epi16_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi16_to___m256d(mem_addr) 
_mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi16_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) - -#define _mm_loadu_epi32_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi32_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi32_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) - -#define _mm_loadu_epi64_to___m128d(mem_addr) _mm_castsi128_pd(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi64_to___m256d(mem_addr) _mm256_castsi256_pd(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi64_to___m512d(mem_addr) _mm512_castsi512_pd(_mm512_loadu_si512((__m512i const*)(mem_addr))) - -// Load integer types and cast to float (__m128, __m256, __m512) -#define _mm_loadu_epi16_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi16_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi16_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) - -#define _mm_loadu_epi32_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi32_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi32_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) - -#define _mm_loadu_epi64_to___m128(mem_addr) _mm_castsi128_ps(_mm_loadu_si128((__m128i const*)(mem_addr))) -#define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) -#define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i const*)(mem_addr))) - "#; pub const X86_CONFIGURATIONS: 
&str = r#" diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index d5ebd960b3..1eac6fb5f9 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -37,8 +37,8 @@ impl SupportedArchitectureTest for X86ArchitectureTest { const NOTICE: &str = config::NOTICE; const PLATFORM_C_HEADERS: &[&str] = &["immintrin.h", "cstddef", "cstdint"]; - const PLATFORM_C_DEFINITIONS: &str = config::LANE_FUNCTION_HELPERS; - const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::LANE_FUNCTION_HELPERS; + const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS; const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; const PLATFORM_RUST_CFGS: &str = config::X86_CONFIGURATIONS; From f188d95948c698510069be91d6863f69929d0b3e Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 30 Sep 2025 01:38:52 +0530 Subject: [PATCH 53/73] feat: attempting compilation of smaller chunks for faster parallel processing --- crates/intrinsic-test/src/common/mod.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index f38f0e5a7c..67bc81f99c 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -49,7 +49,7 @@ pub trait SupportedArchitectureTest { fn cpp_compilation(&self) -> Option; fn build_c_file(&self) -> bool { - let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len()); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 100); let cpp_compiler_wrapped = self.cpp_compilation(); @@ -72,15 +72,6 @@ pub trait SupportedArchitectureTest { return Err(format!("Error writing to mod_{i}.cpp: {error:?}")); } - println!("Finished writing mod_{i}.cpp"); - - Ok(()) - }) - .collect::>() - .unwrap(); - - (0..chunk_count) - .map(|i| 
{ // compile this cpp file into a .o file. // // This is done because `cpp_compiler_wrapped` is None when @@ -135,7 +126,7 @@ pub trait SupportedArchitectureTest { fn build_rust_file(&self) -> bool { std::fs::create_dir_all("rust_programs/src").unwrap(); - let (chunk_size, chunk_count) = chunk_info(self.intrinsics().len()); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 100); let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); write_bin_cargo_toml(&mut cargo, chunk_count).unwrap(); @@ -205,9 +196,13 @@ pub trait SupportedArchitectureTest { } } -pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) { - let available_parallelism = std::thread::available_parallelism().unwrap().get(); - let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count)); +// pub fn chunk_info(intrinsic_count: usize) -> (usize, usize) { +// let available_parallelism = std::thread::available_parallelism().unwrap().get(); +// let chunk_size = intrinsic_count.div_ceil(Ord::min(available_parallelism, intrinsic_count)); + +// (chunk_size, intrinsic_count.div_ceil(chunk_size)) +// } +pub fn manual_chunk(intrinsic_count: usize, chunk_size: usize) -> (usize, usize) { (chunk_size, intrinsic_count.div_ceil(chunk_size)) } From 4cb147070880b0fd27c9973767a8032cb2c5ffcc Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Tue, 30 Sep 2025 14:32:14 +0530 Subject: [PATCH 54/73] feat: add c_programs to PATH and increase chunk size to 400 --- ci/run.sh | 2 ++ crates/intrinsic-test/src/common/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index c8dc6a2e8b..a74769c56d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -95,6 +95,8 @@ case ${TARGET} in TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt export STDARCH_DISABLE_ASSERT_INSTR=1 + PATH="$PATH":"$(pwd)"/c_programs + export PATH export RUSTFLAGS="${RUSTFLAGS} -C 
target-feature=+avx" cargo_test "${PROFILE}" diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 67bc81f99c..86a7876807 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -49,7 +49,7 @@ pub trait SupportedArchitectureTest { fn cpp_compilation(&self) -> Option; fn build_c_file(&self) -> bool { - let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 100); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400); let cpp_compiler_wrapped = self.cpp_compilation(); @@ -126,7 +126,7 @@ pub trait SupportedArchitectureTest { fn build_rust_file(&self) -> bool { std::fs::create_dir_all("rust_programs/src").unwrap(); - let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 100); + let (chunk_size, chunk_count) = manual_chunk(self.intrinsics().len(), 400); let mut cargo = File::create("rust_programs/Cargo.toml").unwrap(); write_bin_cargo_toml(&mut cargo, chunk_count).unwrap(); From 41263b47f497bce2762bed8b0059b6a0ba9779cd Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 3 Oct 2025 01:17:03 +0530 Subject: [PATCH 55/73] feat: display __mmask8 values so that non-utf8 values are not displayed --- crates/intrinsic-test/src/x86/config.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 159851c104..2f859af464 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -222,6 +222,7 @@ pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" std::ostream& operator<<(std::ostream& os, __m128i value); std::ostream& operator<<(std::ostream& os, __m256i value); std::ostream& operator<<(std::ostream& os, __m512i value); + std::ostream& operator<<(std::ostream& os, __mmask8 value); #define _mm512_extract_intrinsic_test_epi8(m, lane) \ _mm_extract_epi8(_mm512_extracti64x2_epi64((m), (lane) / 16), 
(lane) % 16) @@ -343,6 +344,11 @@ std::ostream& operator<<(std::ostream& os, __m512i value) { os << ss.str(); return os; } + +std::ostream& operator<<(std::ostream& os, __mmask8 value) { + os << static_cast(value); + return os; +} "#; pub const X86_CONFIGURATIONS: &str = r#" From b68d5575a4ca0ec94ac98591d9a5d50ff7341ab4 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 3 Oct 2025 18:39:53 +0530 Subject: [PATCH 56/73] feat: add formatting for __m128i, __m256i, __m512i types that is similar to C++ version of the same. --- crates/intrinsic-test/src/common/gen_rust.rs | 3 +- crates/intrinsic-test/src/common/intrinsic.rs | 16 +------ .../src/common/intrinsic_helpers.rs | 18 ++++++++ crates/intrinsic-test/src/x86/config.rs | 44 ++++++++++++++++++- crates/intrinsic-test/src/x86/types.rs | 12 +++++ 5 files changed, 75 insertions(+), 18 deletions(-) diff --git a/crates/intrinsic-test/src/common/gen_rust.rs b/crates/intrinsic-test/src/common/gen_rust.rs index 3b330879e0..27f49a37b1 100644 --- a/crates/intrinsic-test/src/common/gen_rust.rs +++ b/crates/intrinsic-test/src/common/gen_rust.rs @@ -4,7 +4,6 @@ use std::process::Command; use crate::common::intrinsic::Intrinsic; use super::indentation::Indentation; -use super::intrinsic::format_f16_return_value; use super::intrinsic_helpers::IntrinsicTypeDefinition; // The number of times each intrinsic will be called. 
@@ -233,7 +232,6 @@ pub fn generate_rust_test_loop( } } - let return_value = format_f16_return_value(intrinsic); let indentation2 = indentation.nested(); let indentation3 = indentation2.nested(); writeln!( @@ -250,6 +248,7 @@ pub fn generate_rust_test_loop( }}", loaded_args = intrinsic.arguments.load_values_rust(indentation3), args = intrinsic.arguments.as_call_param_rust(), + return_value = intrinsic.results.print_result_rust(), ) } diff --git a/crates/intrinsic-test/src/common/intrinsic.rs b/crates/intrinsic-test/src/common/intrinsic.rs index 95276d19b7..81f6d6d8b5 100644 --- a/crates/intrinsic-test/src/common/intrinsic.rs +++ b/crates/intrinsic-test/src/common/intrinsic.rs @@ -1,5 +1,5 @@ use super::argument::ArgumentList; -use super::intrinsic_helpers::{IntrinsicTypeDefinition, TypeKind}; +use super::intrinsic_helpers::IntrinsicTypeDefinition; /// An intrinsic #[derive(Debug, PartialEq, Clone)] @@ -16,17 +16,3 @@ pub struct Intrinsic { /// Any architecture-specific tags. pub arch_tags: Vec, } - -pub fn format_f16_return_value(intrinsic: &Intrinsic) -> String { - // the `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses - // a string representation of the output value to compare. In C, f16 values are currently printed - // as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print - // them as decimal floating point values. To keep the intrinsics tests working, for now, format - // vectors containing f16 values like C prints them. 
- let return_value = match intrinsic.results.kind() { - TypeKind::Float if intrinsic.results.inner_size() == 16 => "debug_f16(__return_value)", - _ => "format_args!(\"{__return_value:.150?}\")", - }; - - String::from(return_value) -} diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index c52bccb693..c0b9ed2535 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -365,6 +365,24 @@ pub trait IntrinsicTypeDefinition: Deref { /// there is an int i in scope which is the current pass number. fn print_result_c(&self, indentation: Indentation, additional: &str) -> String; + /// Generates a std::cout for the intrinsics results that will match the + /// rust debug output format for the return type. The generated line assumes + /// there is an int i in scope which is the current pass number. + /// + /// The `intrinsic-test` crate compares the output of C and Rust intrinsics. Currently, It uses + /// a string representation of the output value to compare. In C, f16 values are currently printed + /// as hexadecimal integers. Since https://github.com/rust-lang/rust/pull/127013, rust does print + /// them as decimal floating point values. To keep the intrinsics tests working, for now, format + /// vectors containing f16 values like C prints them. 
+ fn print_result_rust(&self) -> String { + let return_value = match self.kind() { + TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)", + _ => "format_args!(\"{__return_value:.150?}\")", + }; + + String::from(return_value) + } + /// To enable architecture-specific logic fn rust_scalar_type(&self) -> String { format!( diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 2f859af464..bf2a37d78e 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -206,7 +206,49 @@ impl DebugHexF16 for __m512i { debug_simd_finish(f, "__m512i", &array) } } - "#; + +trait DebugI16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; +} + +impl DebugI16 for i16 { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{}", self) + } +} + +impl DebugI16 for __m128i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [i16; 8]>(*self) }; + debug_simd_finish(f, "__m128i", &array) + } +} + +impl DebugI16 for __m256i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [i16; 16]>(*self) }; + debug_simd_finish(f, "__m256i", &array) + } +} + +impl DebugI16 for __m512i { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + let array = unsafe { core::mem::transmute::<_, [i16; 32]>(*self) }; + debug_simd_finish(f, "__m512i", &array) + } +} + +fn debug_i16(x: T) -> impl core::fmt::Debug { + struct DebugWrapper(T); + impl core::fmt::Debug for DebugWrapper { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.0.fmt(f) + } + } + DebugWrapper(x) +} + +"#; pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" #ifndef X86_DECLARATIONS diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 
e4b6e12876..5631a01824 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -292,6 +292,18 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { }; format!("{prefix}{bits}") } + + fn print_result_rust(&self) -> String { + let return_value = match self.kind() { + TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)", + _ if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) => { + "debug_i16(__return_value)" + } + _ => "format_args!(\"{__return_value:.150?}\")", + }; + + String::from(return_value) + } } impl X86IntrinsicType { From e13929947ea5fcd423c1d99478bc07196f7e5972 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Mon, 6 Oct 2025 00:18:44 +0530 Subject: [PATCH 57/73] feat: make the debug_i16 into a generic debug_as function that adapts to base type --- crates/intrinsic-test/src/x86/config.rs | 54 ++++++++++++------------- crates/intrinsic-test/src/x86/types.rs | 12 +++--- 2 files changed, 34 insertions(+), 32 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index bf2a37d78e..bf9f066404 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -207,45 +207,45 @@ impl DebugHexF16 for __m512i { } } -trait DebugI16 { +trait DebugAs { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result; } -impl DebugI16 for i16 { +impl DebugAs for T { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{}", self) } } -impl DebugI16 for __m128i { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let array = unsafe { core::mem::transmute::<_, [i16; 8]>(*self) }; - debug_simd_finish(f, "__m128i", &array) - } -} - -impl DebugI16 for __m256i { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let array = unsafe { core::mem::transmute::<_, [i16; 16]>(*self) }; - debug_simd_finish(f, 
"__m256i", &array) - } -} - -impl DebugI16 for __m512i { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let array = unsafe { core::mem::transmute::<_, [i16; 32]>(*self) }; - debug_simd_finish(f, "__m512i", &array) - } -} - -fn debug_i16(x: T) -> impl core::fmt::Debug { - struct DebugWrapper(T); - impl core::fmt::Debug for DebugWrapper { +macro_rules! impl_debug_as { + ($simd:ty, $name:expr, $bits:expr, [$($type:ty),+]) => { + $( + impl DebugAs<$type> for $simd { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + const ELEMENT_BITS: usize = core::mem::size_of::<$type>() * 8; + const NUM_ELEMENTS: usize = $bits / ELEMENT_BITS; + let array = unsafe { core::mem::transmute::<_, [$type; NUM_ELEMENTS]>(*self) }; + debug_simd_finish(f, $name, &array) + } + } + )+ + }; +} + +impl_debug_as!(__m128i, "__m128i", 128, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m256i, "__m256i", 256, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m512i, "__m512i", 512, [u8, i8, u16, i16, u32, i32, u64, i64]); + +fn debug_as(x: V) -> impl core::fmt::Debug +where V: DebugAs +{ + struct DebugWrapper(V, core::marker::PhantomData); + impl, T> core::fmt::Debug for DebugWrapper { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { self.0.fmt(f) } } - DebugWrapper(x) + DebugWrapper(x, core::marker::PhantomData) } "#; diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 5631a01824..94600c989d 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -295,14 +295,16 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { fn print_result_rust(&self) -> String { let return_value = match self.kind() { - TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)", - _ if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) => { - "debug_i16(__return_value)" + TypeKind::Float if 
self.inner_size() == 16 => "debug_f16(__return_value)".to_string(), + TypeKind::Int(_) + if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) => + { + format!("debug_as::<_, u{}>(__return_value)", self.inner_size()) } - _ => "format_args!(\"{__return_value:.150?}\")", + _ => "format_args!(\"{__return_value:.150?}\")".to_string(), }; - String::from(return_value) + return_value } } From c2294ff6a1a2b169badd4da5011b788fbec6cf2a Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 8 Oct 2025 18:14:18 +0530 Subject: [PATCH 58/73] feat: casting the results of the lane function by preserving the bits instead of letting C++ do it (and potentially change the bits) --- crates/intrinsic-test/src/arm/types.rs | 19 +++++------- .../src/common/intrinsic_helpers.rs | 29 ++++++++++++------- crates/intrinsic-test/src/x86/types.rs | 22 +++++--------- 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/crates/intrinsic-test/src/arm/types.rs b/crates/intrinsic-test/src/arm/types.rs index e86a2c5189..c798cbe42d 100644 --- a/crates/intrinsic-test/src/arm/types.rs +++ b/crates/intrinsic-test/src/arm/types.rs @@ -112,12 +112,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { ty = self.c_single_vector_type(), lanes = (0..self.num_lanes()) .map(move |idx| -> std::string::String { + let lane_fn = self.get_lane_function(); + let final_cast = self.generate_final_type_cast(); format!( - "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", - cast = self.c_promotion(), - lane_fn = self.get_lane_function(), - lane = idx, - vector = vector, + "{final_cast}{lane_fn}(__return_value.val[{vector}], {idx})" ) }) .collect::>() @@ -129,12 +127,9 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { } else if self.num_lanes() > 1 { (0..self.num_lanes()) .map(|idx| -> std::string::String { - format!( - "{cast}{lane_fn}(__return_value, {lane})", - cast = self.c_promotion(), - lane_fn = self.get_lane_function(), - lane = idx - ) + let lane_fn = 
self.get_lane_function(); + let final_cast = self.generate_final_type_cast(); + format!("{final_cast}{lane_fn}(__return_value, {idx})") }) .collect::>() .join(r#" << ", " << "#) @@ -150,7 +145,7 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { TypeKind::Poly => format!("poly{}_t", self.inner_size()), ty => todo!("print_result_c - Unknown type: {:#?}", ty), }, - promote = self.c_promotion(), + promote = self.generate_final_type_cast(), ) }; diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index c0b9ed2535..aa8613206e 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -173,9 +173,9 @@ impl IntrinsicType { bit_len: Some(8), .. } => match kind { - TypeKind::Int(Sign::Signed) => "(int)", - TypeKind::Int(Sign::Unsigned) => "(unsigned int)", - TypeKind::Poly => "(unsigned int)(uint8_t)", + TypeKind::Int(Sign::Signed) => "int", + TypeKind::Int(Sign::Unsigned) => "unsigned int", + TypeKind::Poly => "uint8_t", _ => "", }, IntrinsicType { @@ -184,9 +184,9 @@ impl IntrinsicType { .. } => match bit_len { 8 => unreachable!("handled above"), - 16 => "(uint16_t)", - 32 => "(uint32_t)", - 64 => "(uint64_t)", + 16 => "uint16_t", + 32 => "uint32_t", + 64 => "uint64_t", 128 => "", _ => panic!("invalid bit_len"), }, @@ -195,16 +195,16 @@ impl IntrinsicType { bit_len: Some(bit_len), .. } => match bit_len { - 16 => "(float16_t)", - 32 => "(float)", - 64 => "(double)", + 16 => "float16_t", + 32 => "float", + 64 => "double", 128 => "", _ => panic!("invalid bit_len"), }, IntrinsicType { kind: TypeKind::Char(_), .. 
- } => "(char)", + } => "char", _ => "", } } @@ -391,4 +391,13 @@ pub trait IntrinsicTypeDefinition: Deref { bits = self.inner_size() ) } + + fn generate_final_type_cast(&self) -> String { + let type_data = self.c_promotion(); + if type_data.len() > 2 { + format!("({type_data})") + } else { + String::new() + } + } } diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 94600c989d..4ade0fa136 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -185,7 +185,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { .map(move |idx| -> std::string::String { format!( "{cast}{lane_fn}(__return_value.val[{vector}], {lane})", - cast = self.c_promotion(), + cast = self.generate_final_type_cast(), lane_fn = self.get_lane_function(), lane = idx, vector = vector, @@ -200,12 +200,13 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { } else if self.num_lanes() > 1 { (0..self.num_lanes()) .map(|idx| -> std::string::String { - format!( - "{cast}{lane_fn}(__return_value, {lane})", - cast = self.c_promotion(), - lane_fn = self.get_lane_function(), - lane = idx - ) + let cast_type = self.c_promotion(); + let lane_fn = self.get_lane_function(); + if cast_type.len() > 2 { + format!("({cast_type})({lane_fn}(__return_value, {idx}))") + } else { + format!("{lane_fn}(__return_value, {idx})") + } }) .collect::>() .join(r#" << ", " << "#) @@ -224,13 +225,6 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { "__m{}i", self.bit_len.expect(format!("self: {:#?}", self).as_str()) ), - // TypeKind::Float if self.results().inner_size() == 16 => "float16_t".to_string(), - // TypeKind::Int(true) if self.results().inner_size() == 64 => "long".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 64 => "unsigned long".to_string(), - // TypeKind::Int(true) if self.results().inner_size() == 32 => "int".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 32 => "unsigned 
int".to_string(), - // TypeKind::Int(true) if self.results().inner_size() == 16 => "short".to_string(), - // TypeKind::Int(false) if self.results().inner_size() == 16 => "unsigned short".to_string(), _ => self.c_scalar_type(), }, promote = self.c_promotion(), From 717a3ade78bd69362db0f9a66988a354be7341ff Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 8 Oct 2025 19:59:39 +0530 Subject: [PATCH 59/73] fix: update the display of uint8_t type in C++ --- crates/intrinsic-test/src/arm/config.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index 354d8f50b4..e2bc501127 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -9,6 +9,7 @@ std::ostream& operator<<(std::ostream& os, poly128_t value); #endif std::ostream& operator<<(std::ostream& os, float16_t value); +std::ostream& operator<<(std::ostream& os, uint8_t value); // T1 is the `To` type, T2 is the `From` type template T1 cast(T2 x) { @@ -44,6 +45,11 @@ std::ostream& operator<<(std::ostream& os, float16_t value) { os << ss.str(); return os; } + +std::ostream& operator<<(std::ostream& os, uint8_t value) { + os << (unsigned int) value; + return os; +} "#; // Format f16 values (and vectors containing them) in a way that is consistent with C. 
From 88921ac857694fde58c939de01b750dbdeb3178f Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 8 Oct 2025 22:48:37 +0530 Subject: [PATCH 60/73] Explicitly cast bits instead of allowing C++ to automatically cast the same (during typecasting) --- crates/intrinsic-test/src/x86/types.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index 4ade0fa136..f28d4d4988 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -203,7 +203,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { let cast_type = self.c_promotion(); let lane_fn = self.get_lane_function(); if cast_type.len() > 2 { - format!("({cast_type})({lane_fn}(__return_value, {idx}))") + format!("cast<{cast_type}>({lane_fn}(__return_value, {idx}))") } else { format!("{lane_fn}(__return_value, {idx})") } @@ -227,7 +227,7 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { ), _ => self.c_scalar_type(), }, - promote = self.c_promotion(), + promote = self.generate_final_type_cast(), ) }; From ab9e103804ee3227da793a0ddf7ddc4fe5b717ef Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 9 Oct 2025 21:16:44 +0530 Subject: [PATCH 61/73] feat: update cast<> function to reduce spurious cast functions (cases like integer to float or vice versa) --- crates/intrinsic-test/src/x86/config.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index bf9f066404..28c1a7d3ad 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -320,10 +320,9 @@ pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" #define _mm256_loadu_epi64_to___m256(mem_addr) _mm256_castsi256_ps(_mm256_loadu_si256((__m256i const*)(mem_addr))) #define _mm512_loadu_epi64_to___m512(mem_addr) _mm512_castsi512_ps(_mm512_loadu_si512((__m512i 
const*)(mem_addr))) - // T1 is the `To` type, T2 is the `From` type template T1 cast(T2 x) { - if constexpr (std::is_convertible_v) { + if constexpr ((std::is_integral_v && std::is_integral_v) || (std::is_floating_point_v && std::is_floating_point_v)) { return x; } else if constexpr (sizeof(T1) == sizeof(T2)) { T1 ret{}; From b3acb490b87de8ebc12725e2f2bef72092581c33 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 9 Oct 2025 21:19:35 +0530 Subject: [PATCH 62/73] Feat: Compile C++ testfiles using C++23 standard --- crates/intrinsic-test/src/x86/compile.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/compile.rs b/crates/intrinsic-test/src/x86/compile.rs index 27fd5d831c..60997a1278 100644 --- a/crates/intrinsic-test/src/x86/compile.rs +++ b/crates/intrinsic-test/src/x86/compile.rs @@ -34,7 +34,7 @@ pub fn build_cpp_compilation(config: &ProcessedCli) -> Option { "-mavx512vnni", "-mavx512vpopcntdq", "-ferror-limit=1000", - "-std=c++17", + "-std=c++23", ]); if !cpp_compiler.contains("clang") { From cc319934f12f42e33467c419025ddbda0d973b82 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 10 Oct 2025 02:18:08 +0530 Subject: [PATCH 63/73] Feat: allow downcasting (useful for certain cases where uint32_t needs to be cast to float16_t because the bits are stored in the lower half of the type) --- crates/intrinsic-test/src/x86/config.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 28c1a7d3ad..6be3f1b133 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -324,7 +324,7 @@ pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" template T1 cast(T2 x) { if constexpr ((std::is_integral_v && std::is_integral_v) || (std::is_floating_point_v && std::is_floating_point_v)) { return x; - } else if constexpr (sizeof(T1) == sizeof(T2)) { + } else if 
constexpr (sizeof(T1) <= sizeof(T2)) { T1 ret{}; std::memcpy(&ret, &x, sizeof(T1)); return ret; From 400be7f3f2cf1efd306575d6c42abd96751aba45 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 10 Oct 2025 12:49:30 +0530 Subject: [PATCH 64/73] feat: explicitly casting the result of the lane function to unsigned variants for compatibility with the Rust version --- crates/intrinsic-test/src/x86/types.rs | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index f28d4d4988..cdfc6bfa98 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -250,21 +250,21 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { .and_then(|(simd_len, bit_len)| Some(simd_len * bit_len)); match (self.bit_len, total_vector_bits) { - (Some(8), Some(128)) => String::from("_mm_extract_epi8"), - (Some(16), Some(128)) => String::from("_mm_extract_epi16"), - (Some(32), Some(128)) => String::from("_mm_extract_epi32"), - (Some(64), Some(128)) => String::from("_mm_extract_epi64"), - (Some(8), Some(256)) => String::from("_mm256_extract_epi8"), - (Some(16), Some(256)) => String::from("_mm256_extract_epi16"), - (Some(32), Some(256)) => String::from("_mm256_extract_epi32"), - (Some(64), Some(256)) => String::from("_mm256_extract_epi64"), - (Some(8), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi8"), - (Some(16), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi16"), - (Some(32), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi32"), - (Some(64), Some(512)) => String::from("_mm512_extract_intrinsic_test_epi64"), - (Some(8), Some(64)) => String::from("_mm64_extract_intrinsic_test_epi8"), - (Some(16), Some(64)) => String::from("_mm_extract_pi16"), - (Some(32), Some(64)) => String::from("_mm64_extract_intrinsic_test_epi32"), + (Some(8), Some(128)) => String::from("(uint8_t)_mm_extract_epi8"), + 
(Some(16), Some(128)) => String::from("(uint16_t)_mm_extract_epi16"), + (Some(32), Some(128)) => String::from("(uint32_t)_mm_extract_epi32"), + (Some(64), Some(128)) => String::from("(uint64_t)_mm_extract_epi64"), + (Some(8), Some(256)) => String::from("(uint8_t)_mm256_extract_epi8"), + (Some(16), Some(256)) => String::from("(uint16_t)_mm256_extract_epi16"), + (Some(32), Some(256)) => String::from("(uint32_t)_mm256_extract_epi32"), + (Some(64), Some(256)) => String::from("(uint64_t)_mm256_extract_epi64"), + (Some(8), Some(512)) => String::from("(uint8_t)_mm512_extract_intrinsic_test_epi8"), + (Some(16), Some(512)) => String::from("(uint16_t)_mm512_extract_intrinsic_test_epi16"), + (Some(32), Some(512)) => String::from("(uint32_t)_mm512_extract_intrinsic_test_epi32"), + (Some(64), Some(512)) => String::from("(uint64_t)_mm512_extract_intrinsic_test_epi64"), + (Some(8), Some(64)) => String::from("(uint8_t)_mm64_extract_intrinsic_test_epi8"), + (Some(16), Some(64)) => String::from("(uint16_t)_mm_extract_pi16"), + (Some(32), Some(64)) => String::from("(uint32_t)_mm64_extract_intrinsic_test_epi32"), _ => unreachable!( "invalid length for vector argument: {:?}, {:?}", self.bit_len, self.simd_len From 52091b500845da14994d17efa0c7c31534c71729 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sat, 11 Oct 2025 15:17:06 +0530 Subject: [PATCH 65/73] feat: updated exclusion list with more intrinsics, that can be fixed immediately --- Cargo.lock | 77 +++++++++++++++++++-- ci/run.sh | 1 + crates/intrinsic-test/Cargo.toml | 1 + crates/intrinsic-test/missing_x86.txt | 32 ++++++++- crates/intrinsic-test/src/common/cli.rs | 6 ++ crates/intrinsic-test/src/common/compare.rs | 1 - crates/intrinsic-test/src/x86/mod.rs | 13 +++- 7 files changed, 121 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70f09adf2c..e198e14ffe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,6 +282,18 @@ dependencies = [ "wasi", ] +[[package]] +name = "getrandom" +version = 
"0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -348,6 +360,7 @@ dependencies = [ "log", "pretty_env_logger", "quick-xml 0.37.5", + "rand 0.9.2", "rayon", "regex", "serde", @@ -473,7 +486,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" dependencies = [ "env_logger 0.8.4", "log", - "rand", + "rand 0.8.5", ] [[package]] @@ -485,6 +498,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "rand" version = "0.8.5" @@ -492,8 +511,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", ] [[package]] @@ -503,7 +532,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -512,7 +551,16 @@ version = "0.6.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.16", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.4", ] [[package]] @@ -703,7 +751,7 @@ dependencies = [ name = "stdarch-gen-loongarch" version = "0.1.0" dependencies = [ - "rand", + "rand 0.8.5", ] [[package]] @@ -736,7 +784,7 @@ version = "0.0.0" dependencies = [ "core_arch", "quickcheck", - "rand", + "rand 0.8.5", ] [[package]] @@ -819,6 +867,15 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "wasmparser" version = "0.235.0" @@ -1003,6 +1060,12 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" +[[package]] +name = "wit-bindgen" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" + [[package]] name = "xml-rs" version = "0.8.27" diff --git a/ci/run.sh b/ci/run.sh index a74769c56d..bd0e06687f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -94,6 +94,7 @@ case ${TARGET} in TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt + TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 export 
STDARCH_DISABLE_ASSERT_INSTR=1 PATH="$PATH":"$(pwd)"/c_programs export PATH diff --git a/crates/intrinsic-test/Cargo.toml b/crates/intrinsic-test/Cargo.toml index 2c0f53897e..9fb70f32f8 100644 --- a/crates/intrinsic-test/Cargo.toml +++ b/crates/intrinsic-test/Cargo.toml @@ -22,3 +22,4 @@ itertools = "0.14.0" quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] } serde-xml-rs = "0.8.0" regex = "1.11.1" +rand = "0.9.2" diff --git a/crates/intrinsic-test/missing_x86.txt b/crates/intrinsic-test/missing_x86.txt index 824d36f605..e546799740 100644 --- a/crates/intrinsic-test/missing_x86.txt +++ b/crates/intrinsic-test/missing_x86.txt @@ -871,4 +871,34 @@ _m_pxor _m_to_int _m_to_int64 _mm512_mask_floor_pd -_mm512_mask_floor_ps \ No newline at end of file +_mm512_mask_floor_ps + +# SDE ERROR: Cannot execute XGETBV with ECX != 0 +_xgetbv + +# Miscellaneous issues that can be fixed first +_kshiftli_mask16 +_kshiftli_mask32 +_kshiftli_mask64 +_kshiftli_mask8 +_kshiftri_mask16 +_kshiftri_mask32 +_kshiftri_mask64 +_kshiftri_mask8 +_mm256_castsi128_si256 +_mm256_extract_epi16 +_mm256_extract_epi8 +_mm512_castsi128_si512 +_mm512_castsi256_si512 +_mm512_conj_pch +_mm512_mask_reduce_max_pd +_mm512_mask_reduce_max_ps +_mm512_mask_reduce_min_pd +_mm512_mask_reduce_min_ps +_mm_comineq_sh +_mm_extract_epi16 +_mm_extract_epi8 +_mm_mask_cvtepi16_epi8 +_mm_mask_cvtpd_epi32 +_mm_mask_cvtpd_ps +_mm_ucomineq_sh \ No newline at end of file diff --git a/crates/intrinsic-test/src/common/cli.rs b/crates/intrinsic-test/src/common/cli.rs index beae6a4b04..461ab542ea 100644 --- a/crates/intrinsic-test/src/common/cli.rs +++ b/crates/intrinsic-test/src/common/cli.rs @@ -54,6 +54,9 @@ pub struct Cli { /// Set the sysroot for the C++ compiler #[arg(long)] pub cxx_toolchain_dir: Option, + + #[arg(long, default_value_t = 100u8)] + pub sample_percentage: u8, } pub struct ProcessedCli { @@ -65,6 +68,7 @@ pub struct ProcessedCli { pub linker: Option, pub cxx_toolchain_dir: Option, 
pub skip: Vec, + pub sample_percentage: u8, } impl ProcessedCli { @@ -74,6 +78,7 @@ impl ProcessedCli { let target = cli_options.target; let linker = cli_options.linker; let cxx_toolchain_dir = cli_options.cxx_toolchain_dir; + let sample_percentage = cli_options.sample_percentage; let skip = if let Some(filename) = cli_options.skip { let data = std::fs::read_to_string(&filename).expect("Failed to open file"); @@ -108,6 +113,7 @@ impl ProcessedCli { cxx_toolchain_dir, skip, filename, + sample_percentage, } } } diff --git a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs index 1ad00839ef..89e5f965bc 100644 --- a/crates/intrinsic-test/src/common/compare.rs +++ b/crates/intrinsic-test/src/common/compare.rs @@ -14,7 +14,6 @@ pub fn compare_outputs(intrinsic_name_list: &Vec, runner: &str, target: let intrinsics = intrinsic_name_list .par_iter() .filter_map(|intrinsic_name| { - let c = runner_command(runner) .arg("intrinsic-test-programs") .arg(intrinsic_name) diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index 1eac6fb5f9..ca5748e5fb 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -12,6 +12,8 @@ use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; use intrinsic::X86IntrinsicType; use itertools::Itertools; +use rand::rng; +use rand::seq::IndexedRandom; use xml_parser::get_xml_intrinsics; pub struct X86ArchitectureTest { @@ -47,7 +49,10 @@ impl SupportedArchitectureTest for X86ArchitectureTest { let intrinsics = get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); - let mut intrinsics = intrinsics + let mut rng = rng(); + let sample_percentage: usize = cli_options.sample_percentage as usize; + + let intrinsics = intrinsics .into_iter() // Not sure how we would compare intrinsic that returns void. 
.filter(|i| i.results.kind() != TypeKind::Void) @@ -62,6 +67,12 @@ impl SupportedArchitectureTest for X86ArchitectureTest { .unique_by(|i| i.name.clone()) .collect::>(); + let sample_size = (intrinsics.len() * sample_percentage) / 100; + let mut intrinsics = intrinsics + .choose_multiple(&mut rng, sample_size) + .cloned() + .collect::>(); + intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); Self { intrinsics: intrinsics, From 7b80a1fd2fd0c87d362dff8cd45e9f1718cce3fb Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 15 Oct 2025 14:12:49 +0530 Subject: [PATCH 66/73] chore: remove x86-intel.xml from `stdarch-verify` crate --- crates/stdarch-verify/tests/x86-intel.rs | 2 +- crates/stdarch-verify/x86-intel.xml | 158422 -------------------- 2 files changed, 1 insertion(+), 158423 deletions(-) delete mode 100644 crates/stdarch-verify/x86-intel.xml diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index 02b6bdc768..5a98db980b 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -164,7 +164,7 @@ fn verify_all_signatures() { // Open up the network console and you'll see an xml file was downloaded // (currently called data-3.6.9.xml). That's the file we downloaded // here. - let xml = include_bytes!("../x86-intel.xml"); + let xml = include_bytes!("../../../intrinsics_data/x86-intel.xml"); let xml = &xml[..]; let data: Data = quick_xml::de::from_reader(xml).expect("failed to deserialize xml"); diff --git a/crates/stdarch-verify/x86-intel.xml b/crates/stdarch-verify/x86-intel.xml deleted file mode 100644 index 41f2119e68..0000000000 --- a/crates/stdarch-verify/x86-intel.xml +++ /dev/null @@ -1,158422 +0,0 @@ - - - - - - - - Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 
1 : 0) -MEM[out+31:out] := tmp[31:0] -dst[0] := tmp[32] -dst[7:1] := 0 - - - - ADX -
immintrin.h
- Arithmetic -
- - - - - - - Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry or overflow flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) -MEM[out+63:out] := tmp[63:0] -dst[0] := tmp[64] -dst[7:1] := 0 - - - - ADX -
immintrin.h
- Arithmetic -
- - - - - Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." - a[127:0] := ShiftRows(a[127:0]) -a[127:0] := SubBytes(a[127:0]) -a[127:0] := MixColumns(a[127:0]) -dst[127:0] := a[127:0] XOR RoundKey[127:0] - - - AES -
wmmintrin.h
- Cryptography -
- - - - - Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst"." - a[127:0] := ShiftRows(a[127:0]) -a[127:0] := SubBytes(a[127:0]) -dst[127:0] := a[127:0] XOR RoundKey[127:0] - - - AES -
wmmintrin.h
- Cryptography -
- - - - - Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". - a[127:0] := InvShiftRows(a[127:0]) -a[127:0] := InvSubBytes(a[127:0]) -a[127:0] := InvMixColumns(a[127:0]) -dst[127:0] := a[127:0] XOR RoundKey[127:0] - - - AES -
wmmintrin.h
- Cryptography -
- - - - - Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the result in "dst". - a[127:0] := InvShiftRows(a[127:0]) -a[127:0] := InvSubBytes(a[127:0]) -dst[127:0] := a[127:0] XOR RoundKey[127:0] - - - AES -
wmmintrin.h
- Cryptography -
- - - - Perform the InvMixColumns transformation on "a" and store the result in "dst". - dst[127:0] := InvMixColumns(a[127:0]) - - - AES -
wmmintrin.h
- Cryptography -
- - - - - Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from "a" and an 8-bit round constant specified in "imm8", and store the result in "dst"." - X3[31:0] := a[127:96] -X2[31:0] := a[95:64] -X1[31:0] := a[63:32] -X0[31:0] := a[31:0] -RCON[31:0] := ZeroExtend32(imm8[7:0]) -dst[31:0] := SubWord(X1) -dst[63:32] := RotWord(SubWord(X1)) XOR RCON -dst[95:64] := SubWord(X3) -dst[127:96] := RotWord(SubWord(X3)) XOR RCON - - - AES -
wmmintrin.h
- Cryptography -
- - - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(a.row[m].bf16[2*k+0]) * FP32(b.row[k].bf16[2*n+0]) - tmp.fp32[n] += FP32(a.row[m].bf16[2*k+1]) * FP32(b.row[k].bf16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-BF16 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of BF16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+0]) * FP32(src1.row[k].bf16[2*n+0]) - tmp.fp32[n] += FP32(src0.row[m].bf16[2*k+1]) * FP32(src1.row[k].bf16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-BF16 -
immintrin.h
- Application-Targeted -
- - - - - - - Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the imaginary part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The imaginary part of the "a" element is multiplied with the real part of the corresponding "b" element, and the real part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) - tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-COMPLEX -
immintrin.h
- Application-Targeted -
- - - - - - Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "a" and "b" is interpreted as a complex number with FP16 real part and FP16 imaginary part. Calculates the real part of the result. For each possible combination of (row of "a", column of "b"), it performs a set of multiplication and accumulations on all corresponding complex numbers (one from "a" and one from "b"). The real part of the "a" element is multiplied with the real part of the corresponding "b" element, and the negated imaginary part of the "a" element is multiplied with the imaginary part of the corresponding "b" elements. The two accumulated results are added, and then accumulated into the corresponding row and column of "dst". - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) - tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-COMPLEX -
immintrin.h
- Application-Targeted -
- - - Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles "src0" and "src1" is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the imaginary part of the result. - - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+1]) - tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+0]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-COMPLEX -
immintrin.h
- Application-Targeted -
- - - Perform matrix multiplication of two tiles containing complex elements and accumulate the results into a packed single precision tile. Each dword element in input tiles src0 and src1 is interpreted as a complex number with FP16 real part and FP16 imaginary part. This function calculates the real part of the result. - - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) - tmp.fp32[n] += FP32(-src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-COMPLEX -
immintrin.h
- Application-Targeted -
- - - - - - - Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) - tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-FP16 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of FP16 (16-bit) floating-point pairs in tiles "src0" and "src1", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+0]) * FP32(src1.row[k].fp16[2*n+0]) - tmp.fp32[n] += FP32(src0.row[m].fp16[2*k+1]) * FP32(src1.row[k].fp16[2*n+1]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-FP16 -
immintrin.h
- Application-Targeted -
- - - - - - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". - DEFINE DPBD(c, x, y) { - tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) - tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) - tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) - tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) - - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - - - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". - DEFINE DPBD(c, x, y) { - tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) - tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) - tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) - tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) - - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - - - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding unsigned 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". - DEFINE DPBD(c, x, y) { - tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) - tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) - tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) - tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) - - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - - - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". - DEFINE DPBD(c, x, y) { - tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) - tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) - tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) - tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) - - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (a.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], a.row[m].dword[k], b.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - DEFINE DPBD(c, x, y) { - tmp1 := SignExtend32(x.byte[0]) * SignExtend32(y.byte[0]) - tmp2 := SignExtend32(x.byte[1]) * SignExtend32(y.byte[1]) - tmp3 := SignExtend32(x.byte[2]) * SignExtend32(y.byte[2]) - tmp4 := SignExtend32(x.byte[3]) * SignExtend32(y.byte[3]) - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of signed 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - DEFINE DPBD(c, x, y) { - tmp1 := SignExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) - tmp2 := SignExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) - tmp3 := SignExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) - tmp4 := SignExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding signed 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - DEFINE DPBD(c, x, y) { - tmp1 := ZeroExtend32(x.byte[0]) * SignExtend32(y.byte[0]) - tmp2 := ZeroExtend32(x.byte[1]) * SignExtend32(y.byte[1]) - tmp3 := ZeroExtend32(x.byte[2]) * SignExtend32(y.byte[2]) - tmp4 := ZeroExtend32(x.byte[3]) * SignExtend32(y.byte[3]) - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - Compute dot-product of bytes in tiles with a source/destination accumulator. Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "src0" with corresponding unsigned 8-bit integers in "src1", producing 4 intermediate 32-bit results. Sum these 4 results with the corresponding 32-bit integer in "dst", and store the 32-bit result back to tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - DEFINE DPBD(c, x, y) { - tmp1 := ZeroExtend32(x.byte[0]) * ZeroExtend32(y.byte[0]) - tmp2 := ZeroExtend32(x.byte[1]) * ZeroExtend32(y.byte[1]) - tmp3 := ZeroExtend32(x.byte[2]) * ZeroExtend32(y.byte[2]) - tmp4 := ZeroExtend32(x.byte[3]) * ZeroExtend32(y.byte[3]) - RETURN c + tmp1 + tmp2 + tmp3 + tmp4 -} -FOR m := 0 TO dst.rows - 1 - tmp := dst.row[m] - FOR k := 0 TO (src0.colsb / 4) - 1 - FOR n := 0 TO (dst.colsb / 4) - 1 - tmp.dword[n] := DPBD(tmp.dword[n], src0.row[m].dword[k], src1.row[k].dword[n]) - ENDFOR - ENDFOR - write_row_and_zero(dst, m, tmp, dst.colsb) -ENDFOR -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-INT8 -
immintrin.h
- Application-Targeted -
- - - - - Load tile configuration from a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If the specified pallette_id is zero, that signifies the init state for both the tile config and the tile data, and the tiles are zeroed. Any invalid configurations will result in #GP fault. - -// format of memory payload. each field is a byte. -// 0: palette -// 1: start_row -// 2-15: reserved, must be zero -// 16-17: tile0.colsb -// 18-19: tile1.colsb -// 20-21: tile2.colsb -// ... -// 30-31: tile7.colsb -// 32-47: reserved, must be zero -// 48: tile0.rows -// 49: tile1.rows -// 50: tile2.rows -// ... -// 55: tile7.rows -// 56-63: reserved, must be zero - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - Stores the current tile configuration to a 64-byte memory location specified by "mem_addr". The tile configuration format is specified below, and includes the tile type pallette, the number of bytes per row, and the number of rows. If tiles are not configured, all zeroes will be stored to memory. - -// format of memory payload. each field is a byte. -// 0: palette -// 1: start_row -// 2-15: reserved, must be zero -// 16-17: tile0.colsb -// 18-19: tile1.colsb -// 20-21: tile2.colsb -// ... -// 30-31: tile7.colsb -// 32-47: reserved, must be zero -// 48: tile0.rows -// 49: tile1.rows -// 50: tile2.rows -// ... -// 55: tile7.rows -// 56-63: reserved, must be zero - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - - - Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". - start := tileconfig.startRow -IF start == 0 // not restarting, zero incoming state - tilezero(dst) -FI -nbytes := dst.colsb -DO WHILE start < dst.rows - memptr := base + start * stride - write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) - start := start + 1 -OD -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - - - Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst" using the tile configuration previously configured via "_tile_loadconfig". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. - start := tileconfig.startRow -IF start == 0 // not restarting, zero incoming state - tilezero(dst) -FI -nbytes := dst.colsb -DO WHILE start < dst.rows - memptr := base + start * stride - write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) - start := start + 1 -OD -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - Release the tile configuration to return to the init state, which releases all storage it currently holds. - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - - - Store the tile specified by "src" to memory specifieid by "base" address and "stride" using the tile configuration previously configured via "_tile_loadconfig". - start := tileconfig.startRow -DO WHILE start < src.rows - memptr := base + start * stride - write_memory(memptr, src.colsb, src.row[start]) - start := start + 1 -OD -zero_tileconfig_start() - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - Zero the tile specified by "tdest". - nbytes := palette_table[tileconfig.palette_id].bytes_per_row -FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 - FOR j := 0 TO nbytes-1 - tdest.row[i].byte[j] := 0 - ENDFOR -ENDFOR - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - start := tileconfig.startRow -IF start == 0 // not restarting, zero incoming state - tilezero(dst) -FI -nbytes := dst.colsb -DO WHILE start < dst.rows - memptr := base + start * stride - write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) - start := start + 1 -OD -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - Store the tile specified by "src" to memory specifieid by "base" address and "stride". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - start := tileconfig.startRow -DO WHILE start < src.rows - memptr := base + start * stride - write_memory(memptr, src.colsb, src.row[start]) - start := start + 1 -OD -zero_tileconfig_start() - - - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - Load tile rows from memory specifieid by "base" address and "stride" into destination tile "dst". This intrinsic provides a hint to the implementation that the data will likely not be reused in the near future and the data caching can be optimized accordingly. The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - start := tileconfig.startRow -IF start == 0 // not restarting, zero incoming state - tilezero(dst) -FI -nbytes := dst.colsb -DO WHILE start < dst.rows - memptr := base + start * stride - write_row_and_zero(dst, start, read_memory(memptr, nbytes), nbytes) - start := start + 1 -OD -zero_upper_rows(dst, dst.rows) -zero_tileconfig_start() - - - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - Zero the tile specified by "dst". The shape of tile is specified in the struct of __tile1024i. The register of the tile is allocated by compiler. - - nbytes := palette_table[tileconfig.palette_id].bytes_per_row -FOR i := 0 TO palette_table[tileconfig.palette_id].max_rows-1 - FOR j := 0 TO nbytes-1 - tdest.row[i].byte[j] := 0 - ENDFOR -ENDFOR - - - AMX-TILE -
immintrin.h
- Application-Targeted -
- - - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the length of the hypotenous of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Trigonometry -
- - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CEXP(a[31:0], b[31:0]) { - result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) - result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) - RETURN result -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CLOG(a[31:0], b[31:0]) { - result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) - result[63:32] := ATAN2(b, a) - RETURN result -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CSQRT(a[31:0], b[31:0]) { - sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) - result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - RETURN result -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := InvCubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := InvCubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := 1.0 - ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := 1.0 / ERF(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Probability/Statistics -
- - - - - Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 31 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 3 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 31 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 3 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". - FOR j := 0 to 31 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". - FOR j := 0 to 15 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". - FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". - FOR j := 0 to 31 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". - FOR j := 0 to 15 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". - FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Arithmetic -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Special Math Functions -
- - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Miscellaneous -
- - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Miscellaneous -
- - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] + b[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] + b[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - dst[i+63:i] := a[i+63:i] / b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - - Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - -DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { - FOR j := 0 to 3 - i := j*32 - IF imm8[(4+j)%8] - temp[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - temp[i+31:i] := FP32(0.0) - FI - ENDFOR - - sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) - - FOR j := 0 to 3 - i := j*32 - IF imm8[j%8] - tmpdst[i+31:i] := sum[31:0] - ELSE - tmpdst[i+31:i] := FP32(0.0) - FI - ENDFOR - RETURN tmpdst[127:0] -} -dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) -dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[63:0] := a[127:64] + a[63:0] -dst[127:64] := b[127:64] + b[63:0] -dst[191:128] := a[255:192] + a[191:128] -dst[255:192] := b[255:192] + b[191:128] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := a[127:96] + a[95:64] -dst[95:64] := b[63:32] + b[31:0] -dst[127:96] := b[127:96] + b[95:64] -dst[159:128] := a[191:160] + a[159:128] -dst[191:160] := a[255:224] + a[223:192] -dst[223:192] := b[191:160] + b[159:128] -dst[255:224] := b[255:224] + b[223:192] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[63:0] := a[63:0] - a[127:64] -dst[127:64] := b[63:0] - b[127:64] -dst[191:128] := a[191:128] - a[255:192] -dst[255:192] := b[191:128] - b[255:192] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := a[95:64] - a[127:96] -dst[95:64] := b[31:0] - b[63:32] -dst[127:96] := b[95:64] - b[127:96] -dst[159:128] := a[159:128] - a[191:160] -dst[191:160] := a[223:192] - a[255:224] -dst[223:192] := b[159:128] - b[191:160] -dst[255:224] := b[223:192] - b[255:224] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Arithmetic -
- - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. - -IF ((a[255:0] AND b[255:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[255:0]) AND b[255:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -RETURN ZF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. - -IF ((a[255:0] AND b[255:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[255:0]) AND b[255:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -RETURN CF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -IF ((a[255:0] AND b[255:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[255:0]) AND b[255:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := ZF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := CF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[63] == 0 && tmp[127] == 0 && tmp[191] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := ZF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := CF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[63] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := ZF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := CF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 256-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -tmp[255:0] := a[255:0] AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[255:0] := (NOT a[255:0]) AND b[255:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0 && \ - tmp[159] == 0 && tmp[191] == 0 && tmp[223] == 0 && tmp[255] == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "ZF" value. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := ZF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return the "CF" value. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := CF - - - AVX -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in "a" and "b", producing an intermediate 128-bit value, and set "ZF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", producing an intermediate value, and set "CF" to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -tmp[127:0] := a[127:0] AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - ZF := 1 -ELSE - ZF := 0 -FI -tmp[127:0] := (NOT a[127:0]) AND b[127:0] -IF (tmp[31] == 0 && tmp[63] == 0 && tmp[95] == 0 && tmp[127] == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - AVX -
immintrin.h
- Logical -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF imm8[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF imm8[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Extract a 32-bit integer from "a", selected with "index", and store the result in "dst". - -dst[31:0] := (a[255:0] >> (index[2:0] * 32))[31:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - Extract a 64-bit integer from "a", selected with "index", and store the result in "dst". - -dst[63:0] := (a[255:0] >> (index[1:0] * 64))[63:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[159:128] := SELECT4(a[255:128], b[129:128]) -dst[191:160] := SELECT4(a[255:128], b[161:160]) -dst[223:192] := SELECT4(a[255:128], b[193:192]) -dst[255:224] := SELECT4(a[255:128], b[225:224]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -IF (b[129] == 0) dst[191:128] := a[191:128]; FI -IF (b[129] == 1) dst[191:128] := a[255:192]; FI -IF (b[193] == 0) dst[255:192] := a[191:128]; FI -IF (b[193] == 1) dst[255:192] := a[255:192]; FI -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst". - -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128 bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 8-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[4:0]*8 -dst[sel+7:sel] := i[7:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[3:0]*16 -dst[sel+15:sel] := i[15:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[2:0]*32 -dst[sel+31:sel] := i[31:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "index". - -dst[255:0] := a[255:0] -sel := index[1:0]*64 -dst[sel+63:sel] := i[63:0] - - AVX -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Swizzle -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i], rounding) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i], rounding) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Compare -
- - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Convert -
- - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] - - - AVX -
immintrin.h
- Convert -
- - - - Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". - -dst[63:0] := a[63:0] - - - AVX -
immintrin.h
- Convert -
- - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - - AVX -
immintrin.h
- Convert -
- - - - Zero the contents of all XMM or YMM registers. - YMM0[MAX:0] := 0 -YMM1[MAX:0] := 0 -YMM2[MAX:0] := 0 -YMM3[MAX:0] := 0 -YMM4[MAX:0] := 0 -YMM5[MAX:0] := 0 -YMM6[MAX:0] := 0 -YMM7[MAX:0] := 0 -IF _64_BIT_MODE - YMM8[MAX:0] := 0 - YMM9[MAX:0] := 0 - YMM10[MAX:0] := 0 - YMM11[MAX:0] := 0 - YMM12[MAX:0] := 0 - YMM13[MAX:0] := 0 - YMM14[MAX:0] := 0 - YMM15[MAX:0] := 0 -FI - - - AVX -
immintrin.h
- General Support -
- - - - Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified. - YMM0[MAX:128] := 0 -YMM1[MAX:128] := 0 -YMM2[MAX:128] := 0 -YMM3[MAX:128] := 0 -YMM4[MAX:128] := 0 -YMM5[MAX:128] := 0 -YMM6[MAX:128] := 0 -YMM7[MAX:128] := 0 -IF _64_BIT_MODE - YMM8[MAX:128] := 0 - YMM9[MAX:128] := 0 - YMM10[MAX:128] := 0 - YMM11[MAX:128] := 0 - YMM12[MAX:128] := 0 - YMM13[MAX:128] := 0 - YMM14[MAX:128] := 0 - YMM15[MAX:128] := 0 -FI - - - AVX -
immintrin.h
- General Support -
- - - - Return vector of type __m256 with undefined elements. - AVX -
immintrin.h
- General Support -
- - - - Return vector of type __m256d with undefined elements. - AVX -
immintrin.h
- General Support -
- - - - Return vector of type __m256i with undefined elements. - AVX -
immintrin.h
- General Support -
- - - - Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". - -tmp[31:0] := MEM[mem_addr+31:mem_addr] -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := tmp[31:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - Swizzle - - - Broadcast a single-precision (32-bit) floating-point element from memory to all elements of "dst". - -tmp[31:0] := MEM[mem_addr+31:mem_addr] -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := tmp[31:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Load -
- - Swizzle - - - Broadcast a double-precision (64-bit) floating-point element from memory to all elements of "dst". - -tmp[63:0] := MEM[mem_addr+63:mem_addr] -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - Swizzle - - - Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of "dst". - -tmp[127:0] := MEM[mem_addr+127:mem_addr] -dst[127:0] := tmp[127:0] -dst[255:128] := tmp[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - Swizzle - - - Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of "dst". - -tmp[127:0] := MEM[mem_addr+127:mem_addr] -dst[127:0] := tmp[127:0] -dst[255:128] := tmp[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using "mask" (elements are zeroed out when the high bit of the corresponding element is not set). - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX -
immintrin.h
- Load -
- - - - Load 256-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm256_loadu_si256" when the data crosses a cache line boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Load -
- - - - - Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Load -
- - - - - Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Load -
- - - - - Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in "dst". - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -dst[127:0] := MEM[loaddr+127:loaddr] -dst[255:128] := MEM[hiaddr+127:hiaddr] -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Load -
- - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits of integer data from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits of integer data from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". - -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using "mask". - -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". - -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using "mask". - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits of integer data from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX -
immintrin.h
- Store -
- - - - - - Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory at two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] - - AVX -
immintrin.h
- Store -
- - - - - - Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory at two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] - - AVX -
immintrin.h
- Store -
- - - - - - Store the high and low 128-bit halves (each composed of integer data) from "a" into memory at two different 128-bit locations. - "hiaddr" and "loaddr" do not need to be aligned on any particular boundary. - -MEM[loaddr+127:loaddr] := a[127:0] -MEM[hiaddr+127:hiaddr] := a[255:128] - - AVX -
immintrin.h
- Store -
- - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := a[63:32] -dst[95:64] := a[127:96] -dst[127:96] := a[127:96] -dst[159:128] := a[191:160] -dst[191:160] := a[191:160] -dst[223:192] := a[255:224] -dst[255:224] := a[255:224] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Move -
- - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := a[31:0] -dst[95:64] := a[95:64] -dst[127:96] := a[95:64] -dst[159:128] := a[159:128] -dst[191:160] := a[159:128] -dst[223:192] := a[223:192] -dst[255:224] := a[223:192] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Move -
- - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". - -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] -dst[191:128] := a[191:128] -dst[255:192] := a[191:128] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Move -
- - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := 1.0 / a[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Elementary Math Functions -
- - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". - -FOR j := 0 to 3 - i := j*64 - IF a[i+63] - dst[j] := 1 - ELSE - dst[j] := 0 - FI -ENDFOR -dst[MAX:4] := 0 - - - AVX -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". - -FOR j := 0 to 7 - i := j*32 - IF a[i+31] - dst[j] := 1 - ELSE - dst[j] := 0 - FI -ENDFOR -dst[MAX:8] := 0 - - - AVX -
immintrin.h
- Miscellaneous -
- - - - Return vector of type __m256d with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX -
immintrin.h
- Set -
- - - - Return vector of type __m256 with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX -
immintrin.h
- Set -
- - - - Return vector of type __m256i with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. - -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 -dst[135:128] := e16 -dst[143:136] := e17 -dst[151:144] := e18 -dst[159:152] := e19 -dst[167:160] := e20 -dst[175:168] := e21 -dst[183:176] := e22 -dst[191:184] := e23 -dst[199:192] := e24 -dst[207:200] := e25 -dst[215:208] := e26 -dst[223:216] := e27 -dst[231:224] := e28 -dst[239:232] := e29 -dst[247:240] := e30 -dst[255:248] := e31 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. - -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 -dst[143:128] := e8 -dst[159:144] := e9 -dst[175:160] := e10 -dst[191:176] := e11 -dst[207:192] := e12 -dst[223:208] := e13 -dst[239:224] := e14 -dst[255:240] := e15 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - Set packed 64-bit integers in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[63:0] := e3 -dst[127:64] := e2 -dst[191:128] := e1 -dst[255:192] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[31:0] := e7 -dst[63:32] := e6 -dst[95:64] := e5 -dst[127:96] := e4 -dst[159:128] := e3 -dst[191:160] := e2 -dst[223:192] := e1 -dst[255:224] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. - -dst[7:0] := e31 -dst[15:8] := e30 -dst[23:16] := e29 -dst[31:24] := e28 -dst[39:32] := e27 -dst[47:40] := e26 -dst[55:48] := e25 -dst[63:56] := e24 -dst[71:64] := e23 -dst[79:72] := e22 -dst[87:80] := e21 -dst[95:88] := e20 -dst[103:96] := e19 -dst[111:104] := e18 -dst[119:112] := e17 -dst[127:120] := e16 -dst[135:128] := e15 -dst[143:136] := e14 -dst[151:144] := e13 -dst[159:152] := e12 -dst[167:160] := e11 -dst[175:168] := e10 -dst[183:176] := e9 -dst[191:184] := e8 -dst[199:192] := e7 -dst[207:200] := e6 -dst[215:208] := e5 -dst[223:216] := e4 -dst[231:224] := e3 -dst[239:232] := e2 -dst[247:240] := e1 -dst[255:248] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. - -dst[15:0] := e15 -dst[31:16] := e14 -dst[47:32] := e13 -dst[63:48] := e12 -dst[79:64] := e11 -dst[95:80] := e10 -dst[111:96] := e9 -dst[127:112] := e8 -dst[143:128] := e7 -dst[159:144] := e6 -dst[175:160] := e5 -dst[191:176] := e4 -dst[207:192] := e3 -dst[223:208] := e2 -dst[239:224] := e1 -dst[255:240] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. - -dst[31:0] := e7 -dst[63:32] := e6 -dst[95:64] := e5 -dst[127:96] := e4 -dst[159:128] := e3 -dst[191:160] := e2 -dst[223:192] := e1 -dst[255:224] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. - -dst[63:0] := e3 -dst[127:64] := e2 -dst[191:128] := e1 -dst[255:192] := e0 -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastb". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastw". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastd". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate the "vpbroadcastq". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:256] := 0 - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256 vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256d vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256i vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256 vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256d vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - - Set packed __m256i vector "dst" with the supplied values. - -dst[127:0] := lo[127:0] -dst[255:128] := hi[127:0] -dst[MAX:256] := 0 - - - AVX -
immintrin.h
- Set -
- - - - Cast vector of type __m256d to type __m256. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256 to type __m256d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m256; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m256i; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX -
immintrin.h
- Cast -
- - - - - - - Extract an 8-bit integer from "a", selected with "index", and store the result in "dst". - -dst[7:0] := (a[255:0] >> (index[4:0] * 8))[7:0] - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Extract a 16-bit integer from "a", selected with "index", and store the result in "dst". - -dst[15:0] := (a[255:0] >> (index[3:0] * 16))[15:0] - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Blend packed 16-bit integers from "a" and "b" within 128-bit lanes using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF imm8[j%8] - dst[i+15:i] := b[i+15:i] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF imm8[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF imm8[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - IF mask[i+7] - dst[i+7:i] := b[i+7:i] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". - -dst[127:0] := a[127:0] -dst[255:128] := a[127:0] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast 128 bits of integer data from "a" to all 128-bit lanes in "dst". - -dst[127:0] := a[127:0] -dst[255:128] := a[127:0] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of integer data) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of integer data) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128-bits (composed of integer data) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src1, src2, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src1[127:0] - 1: tmp[127:0] := src1[255:128] - 2: tmp[127:0] := src2[127:0] - 3: tmp[127:0] := src2[255:128] - ESAC - IF control[3] - tmp[127:0] := 0 - FI - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0]) -dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 8-bit integers in "a" within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI - IF b[128+i+7] == 1 - dst[128+i+7:128+i] := 0 - ELSE - index[3:0] := b[128+i+3:128+i] - dst[128+i+7:128+i] := a[128+index*8+7:128+index*8] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst". - -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -dst[191:128] := a[191:128] -dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst". - -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] -dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -dst[255:192] := a[255:192] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Swizzle -
- - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ABS(a[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ABS(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Special Math Functions -
- - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[31:16] + a[15:0] -dst[31:16] := a[63:48] + a[47:32] -dst[47:32] := a[95:80] + a[79:64] -dst[63:48] := a[127:112] + a[111:96] -dst[79:64] := b[31:16] + b[15:0] -dst[95:80] := b[63:48] + b[47:32] -dst[111:96] := b[95:80] + b[79:64] -dst[127:112] := b[127:112] + b[111:96] -dst[143:128] := a[159:144] + a[143:128] -dst[159:144] := a[191:176] + a[175:160] -dst[175:160] := a[223:208] + a[207:192] -dst[191:176] := a[255:240] + a[239:224] -dst[207:192] := b[159:144] + b[143:128] -dst[223:208] := b[191:176] + b[175:160] -dst[239:224] := b[223:208] + b[207:192] -dst[255:240] := b[255:240] + b[239:224] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := a[127:96] + a[95:64] -dst[95:64] := b[63:32] + b[31:0] -dst[127:96] := b[127:96] + b[95:64] -dst[159:128] := a[191:160] + a[159:128] -dst[191:160] := a[255:224] + a[223:192] -dst[223:192] := b[191:160] + b[159:128] -dst[255:224] := b[255:224] + b[223:192] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[31:16] + a[15:0]) -dst[31:16] := Saturate16(a[63:48] + a[47:32]) -dst[47:32] := Saturate16(a[95:80] + a[79:64]) -dst[63:48] := Saturate16(a[127:112] + a[111:96]) -dst[79:64] := Saturate16(b[31:16] + b[15:0]) -dst[95:80] := Saturate16(b[63:48] + b[47:32]) -dst[111:96] := Saturate16(b[95:80] + b[79:64]) -dst[127:112] := Saturate16(b[127:112] + b[111:96]) -dst[143:128] := Saturate16(a[159:144] + a[143:128]) -dst[159:144] := Saturate16(a[191:176] + a[175:160]) -dst[175:160] := Saturate16(a[223:208] + a[207:192]) -dst[191:176] := Saturate16(a[255:240] + a[239:224]) -dst[207:192] := Saturate16(b[159:144] + b[143:128]) -dst[223:208] := Saturate16(b[191:176] + b[175:160]) -dst[239:224] := Saturate16(b[223:208] + b[207:192]) -dst[255:240] := Saturate16(b[255:240] + b[239:224]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[15:0] - a[31:16] -dst[31:16] := a[47:32] - a[63:48] -dst[47:32] := a[79:64] - a[95:80] -dst[63:48] := a[111:96] - a[127:112] -dst[79:64] := b[15:0] - b[31:16] -dst[95:80] := b[47:32] - b[63:48] -dst[111:96] := b[79:64] - b[95:80] -dst[127:112] := b[111:96] - b[127:112] -dst[143:128] := a[143:128] - a[159:144] -dst[159:144] := a[175:160] - a[191:176] -dst[175:160] := a[207:192] - a[223:208] -dst[191:176] := a[239:224] - a[255:240] -dst[207:192] := b[143:128] - b[159:144] -dst[223:208] := b[175:160] - b[191:176] -dst[239:224] := b[207:192] - b[223:208] -dst[255:240] := b[239:224] - b[255:240] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := a[95:64] - a[127:96] -dst[95:64] := b[31:0] - b[63:32] -dst[127:96] := b[95:64] - b[127:96] -dst[159:128] := a[159:128] - a[191:160] -dst[191:160] := a[223:192] - a[255:224] -dst[223:192] := b[159:128] - b[191:160] -dst[255:224] := b[223:192] - b[255:224] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[15:0] - a[31:16]) -dst[31:16] := Saturate16(a[47:32] - a[63:48]) -dst[47:32] := Saturate16(a[79:64] - a[95:80]) -dst[63:48] := Saturate16(a[111:96] - a[127:112]) -dst[79:64] := Saturate16(b[15:0] - b[31:16]) -dst[95:80] := Saturate16(b[47:32] - b[63:48]) -dst[111:96] := Saturate16(b[79:64] - b[95:80]) -dst[127:112] := Saturate16(b[111:96] - b[127:112]) -dst[143:128] := Saturate16(a[143:128] - a[159:144]) -dst[159:144] := Saturate16(a[175:160] - a[191:176]) -dst[175:160] := Saturate16(a[207:192] - a[223:208]) -dst[191:176] := Saturate16(a[239:224] - a[255:240]) -dst[207:192] := Saturate16(b[143:128] - b[159:144]) -dst[223:208] := Saturate16(b[175:160] - b[191:176]) -dst[239:224] := Saturate16(b[207:192] - b[223:208]) -dst[255:240] := Saturate16(b[239:224] - b[255:240]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". - -FOR j := 0 to 31 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -FOR j := 0 to 3 - i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Negate packed signed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 31 - i := j*8 - IF b[i+7:i] < 0 - dst[i+7:i] := -(a[i+7:i]) - ELSE IF b[i+7:i] == 0 - dst[i+7:i] := 0 - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Negate packed signed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 15 - i := j*16 - IF b[i+15:i] < 0 - dst[i+15:i] := -(a[i+15:i]) - ELSE IF b[i+15:i] == 0 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Negate packed signed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 7 - i := j*32 - IF b[i+31:i] < 0 - dst[i+31:i] := -(a[i+31:i]) - ELSE IF b[i+31:i] == 0 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Arithmetic -
- - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - dst[i+127:i] := tmp[127:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[j] := a[i+7] -ENDFOR - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Eight SADs are performed for each 128-bit lane using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". - -DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { - a_offset := imm8[2]*32 - b_offset := imm8[1:0]*32 - FOR j := 0 to 7 - i := j*8 - k := a_offset+i - l := b_offset - tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ - ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) - ENDFOR - RETURN tmp[127:0] -} -dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) -dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) -dst[135:128] := Saturate8(a[143:128]) -dst[143:136] := Saturate8(a[159:144]) -dst[151:144] := Saturate8(a[175:160]) -dst[159:152] := Saturate8(a[191:176]) -dst[167:160] := Saturate8(a[207:192]) -dst[175:168] := Saturate8(a[223:208]) -dst[183:176] := Saturate8(a[239:224]) -dst[191:184] := Saturate8(a[255:240]) -dst[199:192] := Saturate8(b[143:128]) -dst[207:200] := Saturate8(b[159:144]) -dst[215:208] := Saturate8(b[175:160]) -dst[223:216] := Saturate8(b[191:176]) -dst[231:224] := Saturate8(b[207:192]) -dst[239:232] := Saturate8(b[223:208]) -dst[247:240] := Saturate8(b[239:224]) -dst[255:248] := Saturate8(b[255:240]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) -dst[143:128] := Saturate16(a[159:128]) -dst[159:144] := Saturate16(a[191:160]) -dst[175:160] := Saturate16(a[223:192]) -dst[191:176] := Saturate16(a[255:224]) -dst[207:192] := Saturate16(b[159:128]) -dst[223:208] := Saturate16(b[191:160]) -dst[239:224] := Saturate16(b[223:192]) -dst[255:240] := Saturate16(b[255:224]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) -dst[135:128] := SaturateU8(a[143:128]) -dst[143:136] := SaturateU8(a[159:144]) -dst[151:144] := SaturateU8(a[175:160]) -dst[159:152] := SaturateU8(a[191:176]) -dst[167:160] := SaturateU8(a[207:192]) -dst[175:168] := SaturateU8(a[223:208]) -dst[183:176] := SaturateU8(a[239:224]) -dst[191:184] := SaturateU8(a[255:240]) -dst[199:192] := SaturateU8(b[143:128]) -dst[207:200] := SaturateU8(b[159:144]) -dst[215:208] := SaturateU8(b[175:160]) -dst[223:216] := SaturateU8(b[191:176]) -dst[231:224] := SaturateU8(b[207:192]) -dst[239:232] := SaturateU8(b[223:208]) -dst[247:240] := SaturateU8(b[239:224]) -dst[255:248] := SaturateU8(b[255:240]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - -dst[15:0] := SaturateU16(a[31:0]) -dst[31:16] := SaturateU16(a[63:32]) -dst[47:32] := SaturateU16(a[95:64]) -dst[63:48] := SaturateU16(a[127:96]) -dst[79:64] := SaturateU16(b[31:0]) -dst[95:80] := SaturateU16(b[63:32]) -dst[111:96] := SaturateU16(b[95:64]) -dst[127:112] := SaturateU16(b[127:96]) -dst[143:128] := SaturateU16(a[159:128]) -dst[159:144] := SaturateU16(a[191:160]) -dst[175:160] := SaturateU16(a[223:192]) -dst[191:176] := SaturateU16(a[255:224]) -dst[207:192] := SaturateU16(b[159:128]) -dst[223:208] := SaturateU16(b[191:160]) -dst[239:224] := SaturateU16(b[223:192]) -dst[255:240] := SaturateU16(b[255:224]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Miscellaneous -
- - - - - Compute the bitwise AND of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[255:0] := (a[255:0] AND b[255:0]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 256 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[255:0] := ((NOT a[255:0]) AND b[255:0]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[255:0] := (a[255:0] OR b[255:0]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of 256 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[255:0] := (a[255:0] XOR b[255:0]) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Logical -
- - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Probability/Statistics -
- - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Probability/Statistics -
- - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Compare -
- - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j:= 0 to 7 - i := 32*j - k := 16*j - dst[i+31:i] := SignExtend32(a[k+15:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j:= 0 to 3 - i := 64*j - k := 16*j - dst[i+63:i] := SignExtend64(a[k+15:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j:= 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := SignExtend64(a[k+31:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - l := j*16 - dst[l+15:l] := SignExtend16(a[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[i+31:i] := SignExtend32(a[k+7:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[i+63:i] := SignExtend64(a[k+7:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[i+31:i] := ZeroExtend32(a[k+15:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j:= 0 to 3 - i := 64*j - k := 16*j - dst[i+63:i] := ZeroExtend64(a[k+15:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j:= 0 to 3 - i := 64*j - k := 32*j - dst[i+63:i] := ZeroExtend64(a[k+31:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - l := j*16 - dst[l+15:l] := ZeroExtend16(a[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[i+31:i] := ZeroExtend32(a[k+7:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[i+63:i] := ZeroExtend64(a[k+7:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Convert -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:64] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:64] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF mask[i+31] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF mask[i+63] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:64] := 0 -dst[MAX:64] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:64] := 0 -dst[MAX:64] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF mask[i+31] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:128] := 0 -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using "mask" (elements are copied from "src" when the highest bit is not set in the corresponding element). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF mask[i+63] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -mask[MAX:256] := 0 -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using "mask" (elements are zeroed out when the highest bit is not set in the corresponding element). - -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - Load 256-bits of integer data from memory into "dst" using a non-temporal memory hint. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Load -
- - - - - - Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX2 -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - -FOR j := 0 to 7 - i := j*32 - IF mask[i+31] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX2 -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX2 -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - -FOR j := 0 to 3 - i := j*64 - IF mask[i+63] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX2 -
immintrin.h
- Store -
- - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) -dst[255:128] := a[255:128] >> (tmp*8) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) -dst[255:128] := a[255:128] >> (tmp*8) -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX2 -
immintrin.h
- Shift -
- - - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 1 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 3 - i := j*64 - dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 1 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 3 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 1 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 3 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -tmp.dword[0] := b.dword[ imm8[1:0] ] -tmp.dword[1] := b.dword[ imm8[3:2] ] -tmp.dword[2] := b.dword[ imm8[5:4] ] -tmp.dword[3] := b.dword[ imm8[7:6] ] -FOR j := 0 to 1 - i := j*64 - dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -tmp.dword[0] := b.dword[ imm8[1:0] ] -tmp.dword[1] := b.dword[ imm8[3:2] ] -tmp.dword[2] := b.dword[ imm8[5:4] ] -tmp.dword[3] := b.dword[ imm8[7:6] ] -FOR j := 0 to 1 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -tmp.dword[0] := b.dword[ imm8[1:0] ] -tmp.dword[1] := b.dword[ imm8[3:2] ] -tmp.dword[2] := b.dword[ imm8[5:4] ] -tmp.dword[3] := b.dword[ imm8[7:6] ] -FOR j := 0 to 1 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := b[i+7:i] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := b[i+7:i] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := b[i+15:i] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := b[i+15:i] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - off := 16*idx[i+3:i] - dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - off := 16*idx[i+2:i] - dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off] -ENDFOR -dst[MAX:128] := 0 - - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - id := idx[i+3:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - id := idx[i+3:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - id := idx[i+3:i]*16 - dst[i+15:i] := a[id+15:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - id := idx[i+2:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - id := idx[i+2:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 16-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - id := idx[i+2:i]*16 - dst[i+15:i] := a[id+15:id] -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". - -FOR j := 0 to 31 - i := j*8 - IF a[i+7] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". - -FOR j := 0 to 15 - i := j*8 - IF a[i+7] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := 0xFF - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := 0xFF - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := 0xFFFF - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := 0xFFFF - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". - -FOR j := 0 to 15 - i := j*16 - IF a[i+15] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". - -FOR j := 0 to 7 - i := j*16 - IF a[i+15] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[4:0] := b[i+3:i] + (j & 0x10) - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[4:0] := b[i+3:i] + (j & 0x10) - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 16 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 32 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 8 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 16 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Load -
- - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Move -
- - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*16 - IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*16 - IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*8 - IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*8 - IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 16 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 32 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 8 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 16 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512BW - AVX512VL -
immintrin.h
- Store -
- - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Convert -
- - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Set -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 31 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 31 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 31 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512BW - AVX512VL -
immintrin.h
- Shift -
- - - - Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[15:0] + src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_ADD(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_ADD(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[15:0] + src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_ADD(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_ADD(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[15:0] + src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_ADD(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_ADD(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[15:0] + src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] + src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_ADD(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_ADD(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[7:0] + src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_ADD(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_ADD(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[7:0] + src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_ADD(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_ADD(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[7:0] + src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_ADD(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_ADD(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[7:0] + src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] + src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_ADD(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_ADD(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[15:0] * src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_MUL(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MUL(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[15:0] * src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_MUL(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 1 - FI -ENDFOR -dst[15:0] := REDUCE_MUL(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[15:0] * src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_MUL(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MUL(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[15:0] * src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] * src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_MUL(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 1 - FI -ENDFOR -dst[15:0] := REDUCE_MUL(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[7:0] * src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_MUL(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MUL(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[7:0] * src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_MUL(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 1 - FI -ENDFOR -dst[7:0] := REDUCE_MUL(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[7:0] * src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_MUL(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MUL(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[7:0] * src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] * src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_MUL(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 1 - FI -ENDFOR -dst[7:0] := REDUCE_MUL(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[15:0] OR src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_OR(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_OR(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[15:0] OR src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_OR(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_OR(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[15:0] OR src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_OR(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_OR(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[15:0] OR src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] OR src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_OR(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_OR(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[7:0] OR src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_OR(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_OR(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[7:0] OR src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_OR(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_OR(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[7:0] OR src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_OR(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_OR(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[7:0] OR src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] OR src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_OR(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_OR(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[15:0] AND src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_AND(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_AND(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[15:0] AND src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_AND(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0xFFFF - FI -ENDFOR -dst[15:0] := REDUCE_AND(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 16-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[15:0] AND src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_AND(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_AND(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 16-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[15:0] AND src[31:16] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := src[i+15:i] AND src[i+16*len+15:i+16*len] - ENDFOR - RETURN REDUCE_AND(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0xFFFF - FI -ENDFOR -dst[15:0] := REDUCE_AND(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[7:0] AND src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_AND(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_AND(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[7:0] AND src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_AND(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0xFF - FI -ENDFOR -dst[7:0] := REDUCE_AND(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 8-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[7:0] AND src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_AND(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_AND(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 8-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[7:0] AND src[15:8] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := src[i+7:i] AND src[i+8*len+7:i+8*len] - ENDFOR - RETURN REDUCE_AND(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0xFF - FI -ENDFOR -dst[7:0] := REDUCE_AND(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MAX(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := Int16(-0x8000) - FI -ENDFOR -dst[15:0] := REDUCE_MAX(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MAX(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := Int16(-0x8000) - FI -ENDFOR -dst[15:0] := REDUCE_MAX(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MAX(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := Int8(-0x80) - FI -ENDFOR -dst[7:0] := REDUCE_MAX(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MAX(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := Int8(-0x80) - FI -ENDFOR -dst[7:0] := REDUCE_MAX(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MAX(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_MAX(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 16-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MAX(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 16-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[15:0] > src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] > src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MAX(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0 - FI -ENDFOR -dst[15:0] := REDUCE_MAX(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MAX(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_MAX(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 8-bit integers in "a" by maximum. Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MAX(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 8-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[7:0] > src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] > src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MAX(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0 - FI -ENDFOR -dst[7:0] := REDUCE_MAX(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MIN(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := Int16(0x7FFF) - FI -ENDFOR -dst[15:0] := REDUCE_MIN(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MIN(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := Int16(0x7FFF) - FI -ENDFOR -dst[15:0] := REDUCE_MIN(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MIN(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := Int8(0x7F) - FI -ENDFOR -dst[7:0] := REDUCE_MIN(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MIN(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := Int8(0x7F) - FI -ENDFOR -dst[7:0] := REDUCE_MIN(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MIN(a, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0xFFFF - FI -ENDFOR -dst[15:0] := REDUCE_MIN(tmp, 8) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 16-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -dst[15:0] := REDUCE_MIN(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 16-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[15:0] < src[31:16] ? src[15:0] : src[31:16]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*16 - src[i+15:i] := (src[i+15:i] < src[i+16*len+15:i+16*len] ? src[i+15:i] : src[i+16*len+15:i+16*len]) - ENDFOR - RETURN REDUCE_MIN(src[16*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[i+15:i] := a[i+15:i] - ELSE - tmp[i+15:i] := 0xFFFF - FI -ENDFOR -dst[15:0] := REDUCE_MIN(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MIN(a, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0xFF - FI -ENDFOR -dst[7:0] := REDUCE_MIN(tmp, 16) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 8-bit integers in "a" by minimum. Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -dst[7:0] := REDUCE_MIN(a, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 8-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[7:0] < src[15:8] ? src[7:0] : src[15:8]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*8 - src[i+7:i] := (src[i+7:i] < src[i+8*len+7:i+8*len] ? src[i+7:i] : src[i+8*len+7:i+8*len]) - ENDFOR - RETURN REDUCE_MIN(src[8*len-1:0], len) -} -tmp := a -FOR j := 0 to 31 - i := j*8 - IF k[j] - tmp[i+7:i] := a[i+7:i] - ELSE - tmp[i+7:i] := 0xFF - FI -ENDFOR -dst[7:0] := REDUCE_MIN(tmp, 32) - - AVX512BW - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Unpack and interleave 32 bits from masks "a" and "b", and store the 64-bit result in "dst". - -dst[31:0] := b[31:0] -dst[63:32] := a[31:0] -dst[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 16 bits from masks "a" and "b", and store the 32-bit result in "dst". - -dst[15:0] := b[15:0] -dst[31:16] := a[15:0] -dst[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from "a", and the last two SADs use the upper 8-bit quadruplet of the lane from "a". Quadruplets from "b" are selected from within 128-bit lanes according to the control in "imm8", and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. - -FOR i := 0 to 3 - tmp.m128[i].dword[0] := b.m128[i].dword[ imm8[1:0] ] - tmp.m128[i].dword[1] := b.m128[i].dword[ imm8[3:2] ] - tmp.m128[i].dword[2] := b.m128[i].dword[ imm8[5:4] ] - tmp.m128[i].dword[3] := b.m128[i].dword[ imm8[7:6] ] -ENDFOR -FOR j := 0 to 7 - i := j*64 - tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8]) +\ - ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24]) - - tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16]) +\ - ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32]) - - tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24]) +\ - ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40]) - - tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32]) +\ - ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48]) -ENDFOR -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - dst[i+127:i] := tmp[127:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate pairs of 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*128 - tmp[255:0] := ((a[i+127:i] << 128)[255:0] OR b[i+127:i]) >> (imm8*8) - tmp_dst[i+127:i] := tmp[127:0] -ENDFOR -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 8-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := b[i+7:i] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := b[i+15:i] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 8-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := idx[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - off := 16*idx[i+4:i] - dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - id := idx[i+4:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - id := idx[i+4:i]*16 - IF k[j] - dst[i+15:i] := a[id+15:id] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 16-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - id := idx[i+4:i]*16 - dst[i+15:i] := a[id+15:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 8-bit integer in "a". - -FOR j := 0 to 63 - i := j*8 - IF a[i+7] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Set each packed 8-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := 0xFF - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Set each packed 16-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := 0xFFFF - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 16-bit integer in "a". - -FOR j := 0 to 31 - i := j*16 - IF a[i+15] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". - -FOR j := 0 to 63 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -FOR j := 0 to 7 - i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 8-bit integers in "a" within 128-bit lanes using the control in the corresponding 8-bit element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Swizzle -
- - - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Swizzle -
- - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[5:0] := b[i+3:i] + (j & 0x30) - dst[i+7:i] := a[index*8+7:index*8] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -tmp_dst[319:256] := a[319:256] -tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -tmp_dst[447:384] := a[447:384] -tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := a[63:0] -tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -tmp_dst[191:128] := a[191:128] -tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -tmp_dst[319:256] := a[319:256] -tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -tmp_dst[447:384] := a[447:384] -tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the high 64 bits of 128-bit lanes of "dst", with the low 64 bits of 128-bit lanes being copied from "a" to "dst". - -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] -dst[191:128] := a[191:128] -dst[207:192] := (a >> (imm8[1:0] * 16))[207:192] -dst[223:208] := (a >> (imm8[3:2] * 16))[207:192] -dst[239:224] := (a >> (imm8[5:4] * 16))[207:192] -dst[255:240] := (a >> (imm8[7:6] * 16))[207:192] -dst[319:256] := a[319:256] -dst[335:320] := (a >> (imm8[1:0] * 16))[335:320] -dst[351:336] := (a >> (imm8[3:2] * 16))[335:320] -dst[367:352] := (a >> (imm8[5:4] * 16))[335:320] -dst[383:368] := (a >> (imm8[7:6] * 16))[335:320] -dst[447:384] := a[447:384] -dst[463:448] := (a >> (imm8[1:0] * 16))[463:448] -dst[479:464] := (a >> (imm8[3:2] * 16))[463:448] -dst[495:480] := (a >> (imm8[5:4] * 16))[463:448] -dst[511:496] := (a >> (imm8[7:6] * 16))[463:448] -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -tmp_dst[383:320] := a[383:320] -tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -tmp_dst[511:448] := a[511:448] -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst", using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -tmp_dst[127:64] := a[127:64] -tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -tmp_dst[255:192] := a[255:192] -tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -tmp_dst[383:320] := a[383:320] -tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -tmp_dst[511:448] := a[511:448] -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of "a" using the control in "imm8". Store the results in the low 64 bits of 128-bit lanes of "dst", with the high 64 bits of 128-bit lanes being copied from "a" to "dst". - -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] -dst[143:128] := (a >> (imm8[1:0] * 16))[143:128] -dst[159:144] := (a >> (imm8[3:2] * 16))[143:128] -dst[175:160] := (a >> (imm8[5:4] * 16))[143:128] -dst[191:176] := (a >> (imm8[7:6] * 16))[143:128] -dst[255:192] := a[255:192] -dst[271:256] := (a >> (imm8[1:0] * 16))[271:256] -dst[287:272] := (a >> (imm8[3:2] * 16))[271:256] -dst[303:288] := (a >> (imm8[5:4] * 16))[271:256] -dst[319:304] := (a >> (imm8[7:6] * 16))[271:256] -dst[383:320] := a[383:320] -dst[399:384] := (a >> (imm8[1:0] * 16))[399:384] -dst[415:400] := (a >> (imm8[3:2] * 16))[399:384] -dst[431:416] := (a >> (imm8[5:4] * 16))[399:384] -dst[447:432] := (a >> (imm8[7:6] * 16))[399:384] -dst[511:448] := a[511:448] -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 8-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 16-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 8-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 16-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Miscellaneous -
- - - - - - Load packed 16-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - - Load packed 16-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - - - Load packed 8-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - - Load packed 8-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 32 packed 16-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 64 packed 8-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Load -
- - - - Load 32-bit mask from memory into "k". - -k[31:0] := MEM[mem_addr+31:mem_addr] - - - AVX512BW -
immintrin.h
- Load -
- - - - Load 64-bit mask from memory into "k". - -k[63:0] := MEM[mem_addr+63:mem_addr] - - - AVX512BW -
immintrin.h
- Load -
- - - - - - Move packed 16-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Move -
- - - - - Move packed 16-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Move -
- - - - - - Move packed 8-bit integers from "a" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Move -
- - - - - Move packed 8-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Move -
- - - - - - Store packed 16-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 31 - i := j*16 - IF k[j] - MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i] - FI -ENDFOR - - - AVX512BW -
immintrin.h
- Store -
- - - - - - Store packed 8-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 63 - i := j*8 - IF k[j] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - AVX512BW -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 32 packed 16-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512BW -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 64 packed 8-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512BW -
immintrin.h
- Store -
- - - - - Store 32-bit mask from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - - AVX512BW -
immintrin.h
- Store -
- - - - - Store 64-bit mask from "a" into memory. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - AVX512BW -
immintrin.h
- Store -
- - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := ABS(a[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := ABS(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ABS(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ABS(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] + b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] + b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 8-bit integers in "a" by packed signed 8-bit integers in "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[i+7:i] - b[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[i+15:i] - b[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Arithmetic -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -tmp_dst[271:256] := Saturate16(a[287:256]) -tmp_dst[287:272] := Saturate16(a[319:288]) -tmp_dst[303:288] := Saturate16(a[351:320]) -tmp_dst[319:304] := Saturate16(a[383:352]) -tmp_dst[335:320] := Saturate16(b[287:256]) -tmp_dst[351:336] := Saturate16(b[319:288]) -tmp_dst[367:352] := Saturate16(b[351:320]) -tmp_dst[383:368] := Saturate16(b[383:352]) -tmp_dst[399:384] := Saturate16(a[415:384]) -tmp_dst[415:400] := Saturate16(a[447:416]) -tmp_dst[431:416] := Saturate16(a[479:448]) -tmp_dst[447:432] := Saturate16(a[511:480]) -tmp_dst[463:448] := Saturate16(b[415:384]) -tmp_dst[479:464] := Saturate16(b[447:416]) -tmp_dst[495:480] := Saturate16(b[479:448]) -tmp_dst[511:496] := Saturate16(b[511:480]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := Saturate16(a[31:0]) -tmp_dst[31:16] := Saturate16(a[63:32]) -tmp_dst[47:32] := Saturate16(a[95:64]) -tmp_dst[63:48] := Saturate16(a[127:96]) -tmp_dst[79:64] := Saturate16(b[31:0]) -tmp_dst[95:80] := Saturate16(b[63:32]) -tmp_dst[111:96] := Saturate16(b[95:64]) -tmp_dst[127:112] := Saturate16(b[127:96]) -tmp_dst[143:128] := Saturate16(a[159:128]) -tmp_dst[159:144] := Saturate16(a[191:160]) -tmp_dst[175:160] := Saturate16(a[223:192]) -tmp_dst[191:176] := Saturate16(a[255:224]) -tmp_dst[207:192] := Saturate16(b[159:128]) -tmp_dst[223:208] := Saturate16(b[191:160]) -tmp_dst[239:224] := Saturate16(b[223:192]) -tmp_dst[255:240] := Saturate16(b[255:224]) -tmp_dst[271:256] := Saturate16(a[287:256]) -tmp_dst[287:272] := Saturate16(a[319:288]) -tmp_dst[303:288] := Saturate16(a[351:320]) -tmp_dst[319:304] := Saturate16(a[383:352]) -tmp_dst[335:320] := Saturate16(b[287:256]) -tmp_dst[351:336] := Saturate16(b[319:288]) -tmp_dst[367:352] := Saturate16(b[351:320]) -tmp_dst[383:368] := Saturate16(b[383:352]) -tmp_dst[399:384] := Saturate16(a[415:384]) -tmp_dst[415:400] := Saturate16(a[447:416]) -tmp_dst[431:416] := Saturate16(a[479:448]) -tmp_dst[447:432] := Saturate16(a[511:480]) -tmp_dst[463:448] := Saturate16(b[415:384]) -tmp_dst[479:464] := Saturate16(b[447:416]) -tmp_dst[495:480] := Saturate16(b[479:448]) -tmp_dst[511:496] := Saturate16(b[511:480]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) -dst[143:128] := Saturate16(a[159:128]) -dst[159:144] := Saturate16(a[191:160]) -dst[175:160] := Saturate16(a[223:192]) -dst[191:176] := Saturate16(a[255:224]) -dst[207:192] := Saturate16(b[159:128]) -dst[223:208] := Saturate16(b[191:160]) -dst[239:224] := Saturate16(b[223:192]) -dst[255:240] := Saturate16(b[255:224]) -dst[271:256] := Saturate16(a[287:256]) -dst[287:272] := Saturate16(a[319:288]) -dst[303:288] := Saturate16(a[351:320]) -dst[319:304] := Saturate16(a[383:352]) -dst[335:320] := Saturate16(b[287:256]) -dst[351:336] := Saturate16(b[319:288]) -dst[367:352] := Saturate16(b[351:320]) -dst[383:368] := Saturate16(b[383:352]) -dst[399:384] := Saturate16(a[415:384]) -dst[415:400] := Saturate16(a[447:416]) -dst[431:416] := Saturate16(a[479:448]) -dst[447:432] := Saturate16(a[511:480]) -dst[463:448] := Saturate16(b[415:384]) -dst[479:464] := Saturate16(b[447:416]) -dst[495:480] := Saturate16(b[479:448]) -dst[511:496] := Saturate16(b[511:480]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -tmp_dst[263:256] := Saturate8(a[271:256]) -tmp_dst[271:264] := Saturate8(a[287:272]) -tmp_dst[279:272] := Saturate8(a[303:288]) -tmp_dst[287:280] := Saturate8(a[319:304]) -tmp_dst[295:288] := Saturate8(a[335:320]) -tmp_dst[303:296] := Saturate8(a[351:336]) -tmp_dst[311:304] := Saturate8(a[367:352]) -tmp_dst[319:312] := Saturate8(a[383:368]) -tmp_dst[327:320] := Saturate8(b[271:256]) -tmp_dst[335:328] := 
Saturate8(b[287:272]) -tmp_dst[343:336] := Saturate8(b[303:288]) -tmp_dst[351:344] := Saturate8(b[319:304]) -tmp_dst[359:352] := Saturate8(b[335:320]) -tmp_dst[367:360] := Saturate8(b[351:336]) -tmp_dst[375:368] := Saturate8(b[367:352]) -tmp_dst[383:376] := Saturate8(b[383:368]) -tmp_dst[391:384] := Saturate8(a[399:384]) -tmp_dst[399:392] := Saturate8(a[415:400]) -tmp_dst[407:400] := Saturate8(a[431:416]) -tmp_dst[415:408] := Saturate8(a[447:432]) -tmp_dst[423:416] := Saturate8(a[463:448]) -tmp_dst[431:424] := Saturate8(a[479:464]) -tmp_dst[439:432] := Saturate8(a[495:480]) -tmp_dst[447:440] := Saturate8(a[511:496]) -tmp_dst[455:448] := Saturate8(b[399:384]) -tmp_dst[463:456] := Saturate8(b[415:400]) -tmp_dst[471:464] := Saturate8(b[431:416]) -tmp_dst[479:472] := Saturate8(b[447:432]) -tmp_dst[487:480] := Saturate8(b[463:448]) -tmp_dst[495:488] := Saturate8(b[479:464]) -tmp_dst[503:496] := Saturate8(b[495:480]) -tmp_dst[511:504] := Saturate8(b[511:496]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := Saturate8(a[15:0]) -tmp_dst[15:8] := Saturate8(a[31:16]) -tmp_dst[23:16] := Saturate8(a[47:32]) -tmp_dst[31:24] := Saturate8(a[63:48]) -tmp_dst[39:32] := Saturate8(a[79:64]) -tmp_dst[47:40] := Saturate8(a[95:80]) -tmp_dst[55:48] := Saturate8(a[111:96]) -tmp_dst[63:56] := Saturate8(a[127:112]) -tmp_dst[71:64] := Saturate8(b[15:0]) -tmp_dst[79:72] := Saturate8(b[31:16]) -tmp_dst[87:80] := Saturate8(b[47:32]) -tmp_dst[95:88] := Saturate8(b[63:48]) -tmp_dst[103:96] := Saturate8(b[79:64]) -tmp_dst[111:104] := Saturate8(b[95:80]) -tmp_dst[119:112] := Saturate8(b[111:96]) -tmp_dst[127:120] := Saturate8(b[127:112]) -tmp_dst[135:128] := Saturate8(a[143:128]) -tmp_dst[143:136] := Saturate8(a[159:144]) -tmp_dst[151:144] := Saturate8(a[175:160]) -tmp_dst[159:152] := Saturate8(a[191:176]) -tmp_dst[167:160] := Saturate8(a[207:192]) -tmp_dst[175:168] := Saturate8(a[223:208]) -tmp_dst[183:176] := Saturate8(a[239:224]) -tmp_dst[191:184] := Saturate8(a[255:240]) -tmp_dst[199:192] := Saturate8(b[143:128]) -tmp_dst[207:200] := Saturate8(b[159:144]) -tmp_dst[215:208] := Saturate8(b[175:160]) -tmp_dst[223:216] := Saturate8(b[191:176]) -tmp_dst[231:224] := Saturate8(b[207:192]) -tmp_dst[239:232] := Saturate8(b[223:208]) -tmp_dst[247:240] := Saturate8(b[239:224]) -tmp_dst[255:248] := Saturate8(b[255:240]) -tmp_dst[263:256] := Saturate8(a[271:256]) -tmp_dst[271:264] := Saturate8(a[287:272]) -tmp_dst[279:272] := Saturate8(a[303:288]) -tmp_dst[287:280] := Saturate8(a[319:304]) -tmp_dst[295:288] := Saturate8(a[335:320]) -tmp_dst[303:296] := Saturate8(a[351:336]) -tmp_dst[311:304] := Saturate8(a[367:352]) -tmp_dst[319:312] := Saturate8(a[383:368]) -tmp_dst[327:320] := Saturate8(b[271:256]) -tmp_dst[335:328] := 
Saturate8(b[287:272]) -tmp_dst[343:336] := Saturate8(b[303:288]) -tmp_dst[351:344] := Saturate8(b[319:304]) -tmp_dst[359:352] := Saturate8(b[335:320]) -tmp_dst[367:360] := Saturate8(b[351:336]) -tmp_dst[375:368] := Saturate8(b[367:352]) -tmp_dst[383:376] := Saturate8(b[383:368]) -tmp_dst[391:384] := Saturate8(a[399:384]) -tmp_dst[399:392] := Saturate8(a[415:400]) -tmp_dst[407:400] := Saturate8(a[431:416]) -tmp_dst[415:408] := Saturate8(a[447:432]) -tmp_dst[423:416] := Saturate8(a[463:448]) -tmp_dst[431:424] := Saturate8(a[479:464]) -tmp_dst[439:432] := Saturate8(a[495:480]) -tmp_dst[447:440] := Saturate8(a[511:496]) -tmp_dst[455:448] := Saturate8(b[399:384]) -tmp_dst[463:456] := Saturate8(b[415:400]) -tmp_dst[471:464] := Saturate8(b[431:416]) -tmp_dst[479:472] := Saturate8(b[447:432]) -tmp_dst[487:480] := Saturate8(b[463:448]) -tmp_dst[495:488] := Saturate8(b[479:464]) -tmp_dst[503:496] := Saturate8(b[495:480]) -tmp_dst[511:504] := Saturate8(b[511:496]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) -dst[135:128] := Saturate8(a[143:128]) -dst[143:136] := Saturate8(a[159:144]) -dst[151:144] := Saturate8(a[175:160]) -dst[159:152] := Saturate8(a[191:176]) -dst[167:160] := Saturate8(a[207:192]) -dst[175:168] := Saturate8(a[223:208]) -dst[183:176] := Saturate8(a[239:224]) -dst[191:184] := Saturate8(a[255:240]) -dst[199:192] := Saturate8(b[143:128]) -dst[207:200] := Saturate8(b[159:144]) -dst[215:208] := Saturate8(b[175:160]) -dst[223:216] := Saturate8(b[191:176]) -dst[231:224] := Saturate8(b[207:192]) -dst[239:232] := Saturate8(b[223:208]) -dst[247:240] := Saturate8(b[239:224]) -dst[255:248] := Saturate8(b[255:240]) -dst[263:256] := Saturate8(a[271:256]) -dst[271:264] := Saturate8(a[287:272]) -dst[279:272] := Saturate8(a[303:288]) -dst[287:280] := Saturate8(a[319:304]) -dst[295:288] := Saturate8(a[335:320]) -dst[303:296] := Saturate8(a[351:336]) -dst[311:304] := Saturate8(a[367:352]) -dst[319:312] := Saturate8(a[383:368]) -dst[327:320] := Saturate8(b[271:256]) -dst[335:328] := Saturate8(b[287:272]) -dst[343:336] := Saturate8(b[303:288]) -dst[351:344] := Saturate8(b[319:304]) -dst[359:352] := Saturate8(b[335:320]) -dst[367:360] := Saturate8(b[351:336]) -dst[375:368] := Saturate8(b[367:352]) -dst[383:376] := Saturate8(b[383:368]) -dst[391:384] := 
Saturate8(a[399:384]) -dst[399:392] := Saturate8(a[415:400]) -dst[407:400] := Saturate8(a[431:416]) -dst[415:408] := Saturate8(a[447:432]) -dst[423:416] := Saturate8(a[463:448]) -dst[431:424] := Saturate8(a[479:464]) -dst[439:432] := Saturate8(a[495:480]) -dst[447:440] := Saturate8(a[511:496]) -dst[455:448] := Saturate8(b[399:384]) -dst[463:456] := Saturate8(b[415:400]) -dst[471:464] := Saturate8(b[431:416]) -dst[479:472] := Saturate8(b[447:432]) -dst[487:480] := Saturate8(b[463:448]) -dst[495:488] := Saturate8(b[479:464]) -dst[503:496] := Saturate8(b[495:480]) -dst[511:504] := Saturate8(b[511:496]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -tmp_dst[271:256] := SaturateU16(a[287:256]) -tmp_dst[287:272] := SaturateU16(a[319:288]) -tmp_dst[303:288] := SaturateU16(a[351:320]) -tmp_dst[319:304] := SaturateU16(a[383:352]) -tmp_dst[335:320] := SaturateU16(b[287:256]) -tmp_dst[351:336] := SaturateU16(b[319:288]) -tmp_dst[367:352] := SaturateU16(b[351:320]) -tmp_dst[383:368] := SaturateU16(b[383:352]) -tmp_dst[399:384] := SaturateU16(a[415:384]) -tmp_dst[415:400] := SaturateU16(a[447:416]) -tmp_dst[431:416] := SaturateU16(a[479:448]) -tmp_dst[447:432] := SaturateU16(a[511:480]) -tmp_dst[463:448] := SaturateU16(b[415:384]) -tmp_dst[479:464] := SaturateU16(b[447:416]) -tmp_dst[495:480] := SaturateU16(b[479:448]) -tmp_dst[511:496] := SaturateU16(b[511:480]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[15:0] := SaturateU16(a[31:0]) -tmp_dst[31:16] := SaturateU16(a[63:32]) -tmp_dst[47:32] := SaturateU16(a[95:64]) -tmp_dst[63:48] := SaturateU16(a[127:96]) -tmp_dst[79:64] := SaturateU16(b[31:0]) -tmp_dst[95:80] := SaturateU16(b[63:32]) -tmp_dst[111:96] := SaturateU16(b[95:64]) -tmp_dst[127:112] := SaturateU16(b[127:96]) -tmp_dst[143:128] := SaturateU16(a[159:128]) -tmp_dst[159:144] := SaturateU16(a[191:160]) -tmp_dst[175:160] := SaturateU16(a[223:192]) -tmp_dst[191:176] := SaturateU16(a[255:224]) -tmp_dst[207:192] := SaturateU16(b[159:128]) -tmp_dst[223:208] := SaturateU16(b[191:160]) -tmp_dst[239:224] := SaturateU16(b[223:192]) -tmp_dst[255:240] := SaturateU16(b[255:224]) -tmp_dst[271:256] := SaturateU16(a[287:256]) -tmp_dst[287:272] := SaturateU16(a[319:288]) -tmp_dst[303:288] := SaturateU16(a[351:320]) -tmp_dst[319:304] := SaturateU16(a[383:352]) -tmp_dst[335:320] := SaturateU16(b[287:256]) -tmp_dst[351:336] := SaturateU16(b[319:288]) -tmp_dst[367:352] := SaturateU16(b[351:320]) -tmp_dst[383:368] := SaturateU16(b[383:352]) -tmp_dst[399:384] := SaturateU16(a[415:384]) -tmp_dst[415:400] := SaturateU16(a[447:416]) -tmp_dst[431:416] := SaturateU16(a[479:448]) -tmp_dst[447:432] := SaturateU16(a[511:480]) -tmp_dst[463:448] := SaturateU16(b[415:384]) -tmp_dst[479:464] := SaturateU16(b[447:416]) -tmp_dst[495:480] := SaturateU16(b[479:448]) -tmp_dst[511:496] := SaturateU16(b[511:480]) -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := tmp_dst[i+15:i] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - -dst[15:0] := SaturateU16(a[31:0]) -dst[31:16] := SaturateU16(a[63:32]) -dst[47:32] := SaturateU16(a[95:64]) -dst[63:48] := SaturateU16(a[127:96]) -dst[79:64] := SaturateU16(b[31:0]) -dst[95:80] := SaturateU16(b[63:32]) -dst[111:96] := SaturateU16(b[95:64]) -dst[127:112] := SaturateU16(b[127:96]) -dst[143:128] := SaturateU16(a[159:128]) -dst[159:144] := SaturateU16(a[191:160]) -dst[175:160] := SaturateU16(a[223:192]) -dst[191:176] := SaturateU16(a[255:224]) -dst[207:192] := SaturateU16(b[159:128]) -dst[223:208] := SaturateU16(b[191:160]) -dst[239:224] := SaturateU16(b[223:192]) -dst[255:240] := SaturateU16(b[255:224]) -dst[271:256] := SaturateU16(a[287:256]) -dst[287:272] := SaturateU16(a[319:288]) -dst[303:288] := SaturateU16(a[351:320]) -dst[319:304] := SaturateU16(a[383:352]) -dst[335:320] := SaturateU16(b[287:256]) -dst[351:336] := SaturateU16(b[319:288]) -dst[367:352] := SaturateU16(b[351:320]) -dst[383:368] := SaturateU16(b[383:352]) -dst[399:384] := SaturateU16(a[415:384]) -dst[415:400] := SaturateU16(a[447:416]) -dst[431:416] := SaturateU16(a[479:448]) -dst[447:432] := SaturateU16(a[511:480]) -dst[463:448] := SaturateU16(b[415:384]) -dst[479:464] := SaturateU16(b[447:416]) -dst[495:480] := SaturateU16(b[479:448]) -dst[511:496] := SaturateU16(b[511:480]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -tmp_dst[263:256] := SaturateU8(a[271:256]) -tmp_dst[271:264] := SaturateU8(a[287:272]) -tmp_dst[279:272] := SaturateU8(a[303:288]) -tmp_dst[287:280] := SaturateU8(a[319:304]) -tmp_dst[295:288] := SaturateU8(a[335:320]) -tmp_dst[303:296] := SaturateU8(a[351:336]) -tmp_dst[311:304] := SaturateU8(a[367:352]) -tmp_dst[319:312] := SaturateU8(a[383:368]) -tmp_dst[327:320] := 
SaturateU8(b[271:256]) -tmp_dst[335:328] := SaturateU8(b[287:272]) -tmp_dst[343:336] := SaturateU8(b[303:288]) -tmp_dst[351:344] := SaturateU8(b[319:304]) -tmp_dst[359:352] := SaturateU8(b[335:320]) -tmp_dst[367:360] := SaturateU8(b[351:336]) -tmp_dst[375:368] := SaturateU8(b[367:352]) -tmp_dst[383:376] := SaturateU8(b[383:368]) -tmp_dst[391:384] := SaturateU8(a[399:384]) -tmp_dst[399:392] := SaturateU8(a[415:400]) -tmp_dst[407:400] := SaturateU8(a[431:416]) -tmp_dst[415:408] := SaturateU8(a[447:432]) -tmp_dst[423:416] := SaturateU8(a[463:448]) -tmp_dst[431:424] := SaturateU8(a[479:464]) -tmp_dst[439:432] := SaturateU8(a[495:480]) -tmp_dst[447:440] := SaturateU8(a[511:496]) -tmp_dst[455:448] := SaturateU8(b[399:384]) -tmp_dst[463:456] := SaturateU8(b[415:400]) -tmp_dst[471:464] := SaturateU8(b[431:416]) -tmp_dst[479:472] := SaturateU8(b[447:432]) -tmp_dst[487:480] := SaturateU8(b[463:448]) -tmp_dst[495:488] := SaturateU8(b[479:464]) -tmp_dst[503:496] := SaturateU8(b[495:480]) -tmp_dst[511:504] := SaturateU8(b[511:496]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[7:0] := SaturateU8(a[15:0]) -tmp_dst[15:8] := SaturateU8(a[31:16]) -tmp_dst[23:16] := SaturateU8(a[47:32]) -tmp_dst[31:24] := SaturateU8(a[63:48]) -tmp_dst[39:32] := SaturateU8(a[79:64]) -tmp_dst[47:40] := SaturateU8(a[95:80]) -tmp_dst[55:48] := SaturateU8(a[111:96]) -tmp_dst[63:56] := SaturateU8(a[127:112]) -tmp_dst[71:64] := SaturateU8(b[15:0]) -tmp_dst[79:72] := SaturateU8(b[31:16]) -tmp_dst[87:80] := SaturateU8(b[47:32]) -tmp_dst[95:88] := SaturateU8(b[63:48]) -tmp_dst[103:96] := SaturateU8(b[79:64]) -tmp_dst[111:104] := SaturateU8(b[95:80]) -tmp_dst[119:112] := SaturateU8(b[111:96]) -tmp_dst[127:120] := SaturateU8(b[127:112]) -tmp_dst[135:128] := SaturateU8(a[143:128]) -tmp_dst[143:136] := SaturateU8(a[159:144]) -tmp_dst[151:144] := SaturateU8(a[175:160]) -tmp_dst[159:152] := SaturateU8(a[191:176]) -tmp_dst[167:160] := SaturateU8(a[207:192]) -tmp_dst[175:168] := SaturateU8(a[223:208]) -tmp_dst[183:176] := SaturateU8(a[239:224]) -tmp_dst[191:184] := SaturateU8(a[255:240]) -tmp_dst[199:192] := SaturateU8(b[143:128]) -tmp_dst[207:200] := SaturateU8(b[159:144]) -tmp_dst[215:208] := SaturateU8(b[175:160]) -tmp_dst[223:216] := SaturateU8(b[191:176]) -tmp_dst[231:224] := SaturateU8(b[207:192]) -tmp_dst[239:232] := SaturateU8(b[223:208]) -tmp_dst[247:240] := SaturateU8(b[239:224]) -tmp_dst[255:248] := SaturateU8(b[255:240]) -tmp_dst[263:256] := SaturateU8(a[271:256]) -tmp_dst[271:264] := SaturateU8(a[287:272]) -tmp_dst[279:272] := SaturateU8(a[303:288]) -tmp_dst[287:280] := SaturateU8(a[319:304]) -tmp_dst[295:288] := SaturateU8(a[335:320]) -tmp_dst[303:296] := SaturateU8(a[351:336]) -tmp_dst[311:304] := SaturateU8(a[367:352]) -tmp_dst[319:312] := SaturateU8(a[383:368]) -tmp_dst[327:320] := 
SaturateU8(b[271:256]) -tmp_dst[335:328] := SaturateU8(b[287:272]) -tmp_dst[343:336] := SaturateU8(b[303:288]) -tmp_dst[351:344] := SaturateU8(b[319:304]) -tmp_dst[359:352] := SaturateU8(b[335:320]) -tmp_dst[367:360] := SaturateU8(b[351:336]) -tmp_dst[375:368] := SaturateU8(b[367:352]) -tmp_dst[383:376] := SaturateU8(b[383:368]) -tmp_dst[391:384] := SaturateU8(a[399:384]) -tmp_dst[399:392] := SaturateU8(a[415:400]) -tmp_dst[407:400] := SaturateU8(a[431:416]) -tmp_dst[415:408] := SaturateU8(a[447:432]) -tmp_dst[423:416] := SaturateU8(a[463:448]) -tmp_dst[431:424] := SaturateU8(a[479:464]) -tmp_dst[439:432] := SaturateU8(a[495:480]) -tmp_dst[447:440] := SaturateU8(a[511:496]) -tmp_dst[455:448] := SaturateU8(b[399:384]) -tmp_dst[463:456] := SaturateU8(b[415:400]) -tmp_dst[471:464] := SaturateU8(b[431:416]) -tmp_dst[479:472] := SaturateU8(b[447:432]) -tmp_dst[487:480] := SaturateU8(b[463:448]) -tmp_dst[495:488] := SaturateU8(b[479:464]) -tmp_dst[503:496] := SaturateU8(b[495:480]) -tmp_dst[511:504] := SaturateU8(b[511:496]) -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := tmp_dst[i+7:i] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Miscellaneous - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) -dst[135:128] := SaturateU8(a[143:128]) -dst[143:136] := SaturateU8(a[159:144]) -dst[151:144] := SaturateU8(a[175:160]) -dst[159:152] := SaturateU8(a[191:176]) -dst[167:160] := SaturateU8(a[207:192]) -dst[175:168] := SaturateU8(a[223:208]) -dst[183:176] := SaturateU8(a[239:224]) -dst[191:184] := SaturateU8(a[255:240]) -dst[199:192] := SaturateU8(b[143:128]) -dst[207:200] := SaturateU8(b[159:144]) -dst[215:208] := SaturateU8(b[175:160]) -dst[223:216] := SaturateU8(b[191:176]) -dst[231:224] := SaturateU8(b[207:192]) -dst[239:232] := SaturateU8(b[223:208]) -dst[247:240] := SaturateU8(b[239:224]) -dst[255:248] := SaturateU8(b[255:240]) -dst[263:256] := SaturateU8(a[271:256]) -dst[271:264] := SaturateU8(a[287:272]) -dst[279:272] := SaturateU8(a[303:288]) -dst[287:280] := SaturateU8(a[319:304]) -dst[295:288] := SaturateU8(a[335:320]) -dst[303:296] := SaturateU8(a[351:336]) -dst[311:304] := SaturateU8(a[367:352]) -dst[319:312] := SaturateU8(a[383:368]) -dst[327:320] := SaturateU8(b[271:256]) -dst[335:328] := SaturateU8(b[287:272]) -dst[343:336] := SaturateU8(b[303:288]) -dst[351:344] := SaturateU8(b[319:304]) -dst[359:352] := SaturateU8(b[335:320]) -dst[367:360] := SaturateU8(b[351:336]) -dst[375:368] := SaturateU8(b[367:352]) 
-dst[383:376] := SaturateU8(b[383:368]) -dst[391:384] := SaturateU8(a[399:384]) -dst[399:392] := SaturateU8(a[415:400]) -dst[407:400] := SaturateU8(a[431:416]) -dst[415:408] := SaturateU8(a[447:432]) -dst[423:416] := SaturateU8(a[463:448]) -dst[431:424] := SaturateU8(a[479:464]) -dst[439:432] := SaturateU8(a[495:480]) -dst[447:440] := SaturateU8(a[511:496]) -dst[455:448] := SaturateU8(b[399:384]) -dst[463:456] := SaturateU8(b[415:400]) -dst[471:464] := SaturateU8(b[431:416]) -dst[479:472] := SaturateU8(b[447:432]) -dst[487:480] := SaturateU8(b[463:448]) -dst[495:488] := SaturateU8(b[479:464]) -dst[503:496] := SaturateU8(b[495:480]) -dst[511:504] := SaturateU8(b[511:496]) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := Saturate8(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - l := j*16 - dst[l+15:l] := SignExtend16(a[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := SignExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := SaturateU8(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - dst[l+7:l] := Truncate8(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+15:i]) - FI -ENDFOR - - - AVX512BW -
immintrin.h
- Convert -
- - - - - Convert packed 16-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := 16*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+15:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - l := j*16 - dst[l+15:l] := ZeroExtend16(a[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - l := j*16 - IF k[j] - dst[l+15:l] := ZeroExtend16(a[i+7:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Convert -
- - - - - - Broadcast 8-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Set -
- - - - - Broadcast 8-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[7:0] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Set -
- - - - - - Broadcast 16-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Set -
- - - - - Broadcast 16-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[15:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Set -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 63 - i := j*8 - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 8-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 8-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 31 - i := j*16 - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 16-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compare packed signed 16-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 63 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 31 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 63 - i := j*8 - IF k1[j] - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 8-bit integers in "a" and "b", producing intermediate 8-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 63 - i := j*8 - k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 31 - i := j*16 - IF k1[j] - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 16-bit integers in "a" and "b", producing intermediate 16-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 31 - i := j*16 - k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Compare -
- - - - - Shift 128-bit lanes in "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) -dst[255:128] := a[255:128] << (tmp*8) -dst[383:256] := a[383:256] << (tmp*8) -dst[511:384] := a[511:384] << (tmp*8) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift 128-bit lanes in "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) -dst[255:128] := a[255:128] >> (tmp*8) -dst[383:256] := a[383:256] >> (tmp*8) -dst[511:384] := a[511:384] >> (tmp*8) -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[i+15:i] < 16 - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512BW -
immintrin.h
- Shift -
- - - - - Add 32-bit masks in "a" and "b", and store the result in "k". - -k[31:0] := a[31:0] + b[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Add 64-bit masks in "a" and "b", and store the result in "k". - -k[63:0] := a[63:0] + b[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 32-bit masks "a" and "b", and store the result in "k". - -k[31:0] := a[31:0] AND b[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 64-bit masks "a" and "b", and store the result in "k". - -k[63:0] := a[63:0] AND b[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 32-bit masks "a" and then AND with "b", and store the result in "k". - -k[31:0] := (NOT a[31:0]) AND b[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 64-bit masks "a" and then AND with "b", and store the result in "k". - -k[63:0] := (NOT a[63:0]) AND b[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - Compute the bitwise NOT of 32-bit mask "a", and store the result in "k". - -k[31:0] := NOT a[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - Compute the bitwise NOT of 64-bit mask "a", and store the result in "k". - -k[63:0] := NOT a[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 32-bit masks "a" and "b", and store the result in "k". - -k[31:0] := a[31:0] OR b[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 64-bit masks "a" and "b", and store the result in "k". - -k[63:0] := a[63:0] OR b[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise XNOR of 32-bit masks "a" and "b", and store the result in "k". - -k[31:0] := NOT (a[31:0] XOR b[31:0]) -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise XNOR of 64-bit masks "a" and "b", and store the result in "k". - -k[63:0] := NOT (a[63:0] XOR b[63:0]) -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise XOR of 32-bit masks "a" and "b", and store the result in "k". - -k[31:0] := a[31:0] XOR b[31:0] -k[MAX:32] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise XOR of 64-bit masks "a" and "b", and store the result in "k". - -k[63:0] := a[63:0] XOR b[63:0] -k[MAX:64] := 0 - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Shift the bits of 32-bit mask "a" left by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 31 - k[31:0] := a[31:0] << count[7:0] -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Shift the bits of 64-bit mask "a" left by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 63 - k[63:0] := a[63:0] << count[7:0] -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Shift the bits of 32-bit mask "a" right by "count" while shifting in zeros, and store the least significant 32 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 31 - k[31:0] := a[31:0] >> count[7:0] -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Shift the bits of 64-bit mask "a" right by "count" while shifting in zeros, and store the least significant 64 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 63 - k[63:0] := a[63:0] >> count[7:0] -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - - Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[31:0] := a[31:0] OR b[31:0] -IF tmp[31:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[31:0] == 0xFFFFFFFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[31:0] := a[31:0] OR b[31:0] -IF tmp[31:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 32-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[31:0] := a[31:0] OR b[31:0] -IF tmp[31:0] == 0xFFFFFFFF - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - - Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[63:0] := a[63:0] OR b[63:0] -IF tmp[63:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[63:0] := a[63:0] OR b[63:0] -IF tmp[63:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 64-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[63:0] := a[63:0] OR b[63:0] -IF tmp[63:0] == 0xFFFFFFFFFFFFFFFF - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - - Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". - -tmp1[31:0] := a[31:0] AND b[31:0] -IF tmp1[31:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[31:0] := (NOT a[31:0]) AND b[31:0] -IF tmp2[31:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 32-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". - -tmp[31:0] := a[31:0] AND b[31:0] -IF tmp[31:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 32-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[31:0] := (NOT a[31:0]) AND b[31:0] -IF tmp[31:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - - Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". - -tmp1[63:0] := a[63:0] AND b[63:0] -IF tmp1[63:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[63:0] := (NOT a[63:0]) AND b[63:0] -IF tmp2[63:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 64-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". - -tmp[63:0] := a[63:0] AND b[63:0] -IF tmp[63:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 64-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[63:0] := (NOT a[63:0]) AND b[63:0] -IF tmp[63:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512BW -
immintrin.h
- Mask -
- - - - Convert 32-bit mask "a" into an integer value, and store the result in "dst". - -dst := ZeroExtend32(a[31:0]) - - - AVX512BW -
immintrin.h
- Mask -
- - - - Convert 64-bit mask "a" into an integer value, and store the result in "dst". - -dst := ZeroExtend64(a[63:0]) - - - AVX512BW -
immintrin.h
- Mask -
- - - - Convert integer value "a" into a 32-bit mask, and store the result in "k". - -k := ZeroExtend32(a[31:0]) - - - AVX512BW -
immintrin.h
- Mask -
- - - - Convert integer value "a" into a 64-bit mask, and store the result in "k". - -k := ZeroExtend64(a[63:0]) - - - AVX512BW -
immintrin.h
- Mask -
- - - - - - Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ZeroExtend64(k[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ZeroExtend64(k[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ZeroExtend32(k[15:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ZeroExtend32(k[15:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*32 - FOR k := 0 to j-1 - m := k*32 - dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*32 - FOR k := 0 to j-1 - m := k*32 - dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*64 - FOR k := 0 to j-1 - m := k*64 - dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 1 - i := j*64 - FOR k := 0 to j-1 - m := k*64 - dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Compare -
- - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512CD - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Broadcast the low 8-bits from input mask "k" to all 64-bit elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ZeroExtend64(k[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Swizzle -
- - - - Broadcast the low 16-bits from input mask "k" to all 32-bit elements of "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ZeroExtend32(k[15:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Swizzle -
- - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 15 - i := j*32 - FOR k := 0 to j-1 - m := k*32 - dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - - Test each 32-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - FOR l := 0 to j-1 - m := l*32 - dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0 - ENDFOR - dst[i+31:i+j] := 0 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit. Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*64 - FOR k := 0 to j-1 - m := k*64 - dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - - Test each 64-bit element of "a" for equality with all other elements in "a" closer to the least significant bit using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in "dst". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - FOR l := 0 to j-1 - m := l*64 - dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0 - ENDFOR - dst[i+63:i+j] := 0 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Compare -
- - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 32-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp := 31 - dst[i+31:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+31:i] := dst[i+31:i] + 1 - OD - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - - Counts the number of leading zero bits in each packed 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp := 63 - dst[i+63:i] := 0 - DO WHILE (tmp >= 0 AND a[i+tmp] == 0) - tmp := tmp - 1 - dst[i+63:i] := dst[i+63:i] + 1 - OD - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512CD -
immintrin.h
- Bit Manipulation -
- - - - - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Logical -
- - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 3 - i := j*64 - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 1 - i := j*64 - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) -ENDFOR -k[MAX:2] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 7 - i := j*32 - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 3 - i := j*32 - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE imm8[0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". - -FOR j := 0 to 7 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". - -FOR j := 0 to 3 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := 0xFFFFFFFF - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := 0xFFFFFFFF - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 3 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 1 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[63:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Logical -
- - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 8 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 2 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the lower 2 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 2)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 8 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 8)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 2 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 2)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract 256 bits (composed of 8 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 2 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 7 - i := j*64 - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Test packed double-precision (64-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR j := 0 to 15 - i := j*32 - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) -ENDFOR -k[MAX:16] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Test packed single-precision (32-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0]) - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". - [fpclass_note] - k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) -k[MAX:1] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Test the lower double-precision (64-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - [fpclass_note] - IF k1[0] - k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0]) -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". - [fpclass_note] - k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) -k[MAX:1] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Test the lower single-precision (32-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - [fpclass_note] - IF k1[0] - k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0]) -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[0]) OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE imm8[1:0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE imm8[0] OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 8 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE imm8[1:0] OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 2 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 32-bit integer in "a". - -FOR j := 0 to 15 - i := j*32 - IF a[i+31] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Set each packed 32-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := 0xFFFFFFFF - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Set each packed 64-bit integer in "dst" to all ones or all zeros based on the value of the corresponding bit in "k". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := 0xFFFFFFFFFFFFFFFF - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - Set each bit of mask register "k" based on the most significant bit of the corresponding packed 64-bit integer in "a". - -FOR j := 0 to 7 - i := j*64 - IF a[i+63] - k[j] := 1 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0] - 1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0] - 2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0] - 3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0]) - 1: dst[63:0] := tmp[63:0] - 2: dst[63:0] := (0 << 63) OR (tmp[62:0]) - 3: dst[63:0] := (1 << 63) OR (tmp[62:0]) - ESAC - - RETURN dst -} -dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -IF k[0] - dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Calculate the max, min, absolute max, or absolute min (depending on control in "imm8") for the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute min, 11 = absolute max. - imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit. [sae_note] - -DEFINE RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0]) { - CASE opCtl[1:0] OF - 0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0] - 1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0] - 2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0] - 3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0] - ESAC - - CASE signSelCtl[1:0] OF - 0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0]) - 1: dst[31:0] := tmp[31:0] - 2: dst[31:0] := (0 << 31) OR (tmp[30:0]) - 3: dst[31:0] := (1 << 31) OR (tmp[30:0]) - ESAC - - RETURN dst -} -dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed double-precision (64-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed single-precision (32-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of the lower double-precision (64-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPD(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - tmp[63:0] := src1[63:0] - tmp[63:0] - IF IsInf(tmp[63:0]) - tmp[63:0] := FP64(0.0) - FI - RETURN tmp[63:0] -} -dst[63:0] := ReduceArgumentPD(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of the lower single-precision (32-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentPS(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - tmp[31:0] := src1[31:0] - tmp[31:0] - IF IsInf(tmp[31:0]) - tmp[31:0] := FP32(0.0) - FI - RETURN tmp[31:0] -} -dst[31:0] := ReduceArgumentPS(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512DQ -
immintrin.h
- Miscellaneous -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_FP64_To_UInt64_Truncate(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_FP32_To_UInt64_Truncate(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512DQ -
immintrin.h
- Convert -
- - - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 64-bit integers in "a" and "b", producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := a[i+63:i] * b[i+63:i] - dst[i+63:i] := tmp[63:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512DQ -
immintrin.h
- Arithmetic -
- - - - - Add 8-bit masks in "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] + b[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Add 16-bit masks in "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] + b[15:0] -k[MAX:16] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] AND b[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 8-bit masks "a" and then AND with "b", and store the result in "k". - -k[7:0] := (NOT a[7:0]) AND b[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - Compute the bitwise NOT of 8-bit mask "a", and store the result in "k". - -k[7:0] := NOT a[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] OR b[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise XNOR of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := NOT (a[7:0] XOR b[7:0]) -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise XOR of 8-bit masks "a" and "b", and store the result in "k". - -k[7:0] := a[7:0] XOR b[7:0] -k[MAX:8] := 0 - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Shift the bits of 8-bit mask "a" left by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 7 - k[7:0] := a[7:0] << count[7:0] -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Shift the bits of 8-bit mask "a" right by "count" while shifting in zeros, and store the least significant 8 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 7 - k[7:0] := a[7:0] >> count[7:0] -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[7:0] == 0xFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 8-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := a[7:0] OR b[7:0] -IF tmp[7:0] == 0xFF - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". - -tmp1[7:0] := a[7:0] AND b[7:0] -IF tmp1[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[7:0] := (NOT a[7:0]) AND b[7:0] -IF tmp2[7:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 8-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := a[7:0] AND b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 8-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[7:0] := (NOT a[7:0]) AND b[7:0] -IF tmp[7:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". Compute the bitwise NOT of "a" and then AND with "b", if the result is all zeros, store 1 in "and_not", otherwise store 0 in "and_not". - -tmp1[15:0] := a[15:0] AND b[15:0] -IF tmp1[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -tmp2[15:0] := (NOT a[15:0]) AND b[15:0] -IF tmp2[15:0] == 0x0 - MEM[and_not+7:and_not] := 1 -ELSE - MEM[and_not+7:and_not] := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and if the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". - -tmp[15:0] := a[15:0] AND b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 16-bit mask "a" and then AND with "b", if the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[15:0] := (NOT a[15:0]) AND b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512DQ -
immintrin.h
- Mask -
- - - - Convert 8-bit mask "a" into an integer value, and store the result in "dst". - -dst := ZeroExtend32(a[7:0]) - - - AVX512DQ -
immintrin.h
- Mask -
- - - - Convert integer value "a" into an 8-bit mask, and store the result in "k". - -k := a[7:0] - - - AVX512DQ -
immintrin.h
- Mask -
- - - - Load 8-bit mask from memory into "k". - -k[7:0] := MEM[mem_addr+7:mem_addr] - - - AVX512DQ -
immintrin.h
- Load -
- - - - - Store 8-bit mask from "a" into memory. - -MEM[mem_addr+7:mem_addr] := a[7:0] - - - AVX512DQ -
immintrin.h
- Store -
- - - - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ACOS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ACOS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ACOSH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ACOSH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ASIN(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ASIN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ASINH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ASINH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ATAN(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ATAN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" and store the results in "dst" expressed in radians. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" expressed in radians using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ATANH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ATANH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := COS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := COS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := COSD(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := COSD(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := COSH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := COSH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SIN(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SIN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SINH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SINH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SIND(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SIND(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := TAN(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := TAN(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := TAND(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := TAND(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := TANH(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := TANH(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) - ELSE - dst[i+63:i] := sin_src[i+63:i] - MEM[mem_addr+i+63:mem_addr+i] := cos_src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) - ELSE - dst[i+31:i] := sin_src[i+31:i] - MEM[mem_addr+i+31:mem_addr+i] := cos_src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 - - AVX512F -
immintrin.h
- Trigonometry -
- - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := CubeRoot(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := CubeRoot(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POW(10.0, a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POW(2.0, a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POW(e, a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := InvSQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := InvSQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LOG(1.0 + a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LOG(1.0 + a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LOG(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Computes the reciprocal of packed double-precision (64-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Computes the reciprocal of packed single-precision (32-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := CDFNormal(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := CDFNormal(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ERF(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ERF(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := 1.0 - ERF(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := 1.0 - ERF(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := 1.0 / ERF(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := 1.0 / ERF(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Probability/Statistics -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := CEIL(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := CEIL(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FLOOR(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := FLOOR(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := NearbyInt(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Rounds each packed double-precision (64-bit) floating-point element in "a" to the nearest integer value and stores the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := NearbyInt(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := NearbyInt(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Rounds each packed single-precision (32-bit) floating-point element in "a" to the nearest integer value and stores the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := NearbyInt(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundToNearestEven(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Rounds the packed double-precision (64-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundToNearestEven(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundToNearestEven(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Rounds the packed single-precision (32-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundToNearestEven(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ROUND(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := TRUNCATE(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := TRUNCATE(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed signed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 63 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 31 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". - FOR j := 0 to 63 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". - FOR j := 0 to 31 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". - FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 63 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 31 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". - FOR j := 0 to 63 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". - FOR j := 0 to 31 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". - FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst". - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) -dst[255:0] := temp[255:0] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 32 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (32*imm8[2:0]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst". - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -dst[127:0] := temp[127:0] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 32-bit elements, and store the low 16 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (32*imm8[1:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst". - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -dst[255:0] := temp[255:0] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate "a" and "b" into a 64-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 32 bytes (4 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[511:256] := a[255:0] -temp[255:0] := b[255:0] -temp[511:0] := temp[511:0] >> (64*imm8[1:0]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst". - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -dst[127:0] := temp[127:0] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate "a" and "b" into a 32-byte immediate result, shift the result right by "imm8" 64-bit elements, and store the low 16 bytes (2 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[255:128] := a[127:0] -temp[127:0] := b[127:0] -temp[255:0] := temp[255:0] >> (64*imm8[0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -ESAC -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN: j := 0 - SNAN_TOKEN: j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[255:0] := a[255:0] -CASE (imm8[0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -ESAC -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[255:0] := a[255:0] -CASE (imm8[0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -ESAC -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 32-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set) - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - off := idx[i+2:i]*32 - dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - off := idx[i+1:i]*32 - dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - off := idx[i+1:i]*64 - dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - IF k[j] - dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 64-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - off := idx[i]*64 - dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:128] := 0 - - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - dst[i+63:i] := a[id+63:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - id := idx[i+2:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 64-bit integers in "a" across lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - id := idx[i+1:i]*64 - dst[i+63:i] := a[id+63:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst.m128[0] := a.m128[imm8[0]] -tmp_dst.m128[1] := b.m128[imm8[1]] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -dst.m128[0] := a.m128[imm8[0]] -dst.m128[1] := b.m128[imm8[1]] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 3 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 1 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 3 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 3 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 1 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 3 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 3 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 3 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 3 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:4] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 1 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 1 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:2] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Compare -
- - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 7 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 3 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 3 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 1 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 4 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 8 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 2 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 4 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512F - AVX512VL -
immintrin.h
- Store -
- - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := src[m+63:m] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := src[m+63:m] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 7 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*16 - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - l := j*16 - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:16] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) - FI -ENDFOR - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in the low 2 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in the low 4 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Convert -
- - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 3 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 1 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 4 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 8 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 2 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 4 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Load -
- - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Move -
- - - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*32 - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*32 - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 3 - i := j*64 - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 1 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 1 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 1 - i := j*64 - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Logical -
- - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Set -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Shift -
- - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - - Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F - VAES -
immintrin.h
- Cryptography -
- - - - - Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - a[i+127:i] := MixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F - VAES -
immintrin.h
- Cryptography -
- - - - - Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F - VAES -
immintrin.h
- Cryptography -
- - - - - Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 3 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - a[i+127:i] := InvMixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F - VAES -
immintrin.h
- Cryptography -
- - - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := a[63:0] + b[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] + b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] + b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] + b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := a[i+63:i] / b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
 - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := 64*j - dst[i+63:i] := a[i+63:i] / b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F - 
immintrin.h
- Arithmetic -
- - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := 64*j - IF k[j] - dst[i+63:i] := a[i+63:i] / b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := a[i+31:i] / b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := a[63:0] / b[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] / b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] / b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] / b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] / b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] / b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] / b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] / b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] / b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] / b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
 - - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F - 
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper element from "c" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := c[63:0] -FI -dst[127:64] := c[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", subtract the lower element in "c" from the negated intermediate result, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 3 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := c[31:0] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] * b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := a[63:0] * b[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] * b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] * b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := a[63:0] - b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := a[63:0] - b[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := a[31:0] - b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := a[31:0] - b[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 16-bit mask from "a" into memory. - -MEM[mem_addr+15:mem_addr] := a[15:0] - - - AVX512F -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits of integer data from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits of integer data from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - MEM[mem_addr+63:mem_addr] := a[63:0] -FI - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store the lower single-precision (32-bit) floating-point element from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - MEM[mem_addr+31:mem_addr] := a[31:0] -FI - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 32 -m := base_addr -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 64 -m := base_addr -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter 64-bit integers from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 32-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter double-precision (64-bit) floating-point elements from "a" into memory using 64-bit indices. 64-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 64-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Store -
- - - - - - - Multiplies elements in packed 64-bit integer vectors "a" and "b" together, storing the lower 64 bits of the result in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Store -
- - - - Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 16-bit mask from memory into "k". - -k[15:0] := MEM[mem_addr+15:mem_addr] - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed 32-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed 64-bit integers from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits of integer data from memory into "dst" using a non-temporal memory hint. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[63:0] := MEM[mem_addr+63:mem_addr] -ELSE - dst[63:0] := src[63:0] -FI -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper element of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[63:0] := MEM[mem_addr+63:mem_addr] -ELSE - dst[63:0] := 0 -FI -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[31:0] := MEM[mem_addr+31:mem_addr] -ELSE - dst[31:0] := src[31:0] -FI -dst[MAX:32] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load a single-precision (32-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -IF k[0] - dst[31:0] := MEM[mem_addr+31:mem_addr] -ELSE - dst[31:0] := 0 -FI -dst[MAX:32] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 32-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 64-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at "base_addr" and offset by each 64-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 7 - i := j*64 - m := j*64 - IF k[j] - addr := base_addr + vindex[m+63:m] * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] AND b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". - -k[15:0] := (NOT a[15:0]) AND b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". - -k[15:0] := NOT a[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] OR b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := NOT (a[15:0] XOR b[15:0]) -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] XOR b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Shift the bits of 16-bit mask "a" left by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 15 - k[15:0] := a[15:0] << count[7:0] -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - - Shift the bits of 16-bit mask "a" right by "count" while shifting in zeros, and store the least significant 16 bits of the result in "k". - -k[MAX:0] := 0 -IF count[7:0] <= 15 - k[15:0] := a[15:0] >> count[7:0] -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeros, store 1 in "dst", otherwise store 0 in "dst". If the result is all ones, store 1 in "all_ones", otherwise store 0 in "all_ones". - -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI -IF tmp[15:0] == 0xFFFF - MEM[all_ones+7:all_ones] := 1 -ELSE - MEM[all_ones+7:all_ones] := 0 -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all zeroes, store 1 in "dst", otherwise store 0 in "dst". - -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0x0 - dst := 1 -ELSE - dst := 0 -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 16-bit masks "a" and "b". If the result is all ones, store 1 in "dst", otherwise store 0 in "dst". - -tmp[15:0] := a[15:0] OR b[15:0] -IF tmp[15:0] == 0xFFFF - dst := 1 -ELSE - dst := 0 -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - Convert 16-bit mask "a" into an integer value, and store the result in "dst". - -dst := ZeroExtend32(a[15:0]) - - - AVX512F -
immintrin.h
- Mask -
- - - - Convert integer value "a" into a 16-bit mask, and store the result in "k". - -k := ZeroExtend16(a[15:0]) - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise NOT of 16-bit masks "a" and then AND with "b", and store the result in "k". - -k[15:0] := (NOT a[15:0]) AND b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise AND of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] AND b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - Copy 16-bit mask "a" to "k". - -k[15:0] := a[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - Compute the bitwise NOT of 16-bit mask "a", and store the result in "k". - -k[15:0] := NOT a[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise OR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] OR b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Unpack and interleave 8 bits from masks "a" and "b", and store the 16-bit result in "k". - -k[7:0] := b[7:0] -k[15:8] := a[7:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise XNOR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := NOT (a[15:0] XOR b[15:0]) -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Compute the bitwise XOR of 16-bit masks "a" and "b", and store the result in "k". - -k[15:0] := a[15:0] XOR b[15:0] -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Mask -
- - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". ZF flag is set if "dst" is 0. - dst[15:0] := k1[15:0] | k2[15:0] -IF dst == 0 - SetZF() -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - - Performs bitwise OR between "k1" and "k2", storing the result in "dst". CF flag is set if "dst" consists of all 1's. - dst[15:0] := k1[15:0] | k2[15:0] -IF PopCount(dst[15:0]) == 16 - SetCF() -FI - - - AVX512F -
immintrin.h
- Mask -
- - - - Converts bit mask "k1" into an integer value, storing the results in "dst". - -dst := ZeroExtend32(k1) - - - AVX512F -
immintrin.h
- Mask -
- - - - Converts integer "mask" into bitmask, storing the result in "dst". - -dst := mask[15:0] - - - AVX512F -
immintrin.h
- Mask -
- - - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst". - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -dst[511:0] := temp[511:0] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 64-bit elements, and store the low 64 bytes (8 elements) in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (64*imm8[2:0]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := temp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up packed double-precision (64-bit) floating-point elements in "a" and "b" using packed 64-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up packed single-precision (32-bit) floating-point elements in "a" and "b" using packed 32-bit integers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst", and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? -INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -ELSE - dst[63:0] := a[63:0] -FI -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower double-precision (64-bit) floating-point elements in "a" and "b" using the lower 64-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "b" to the upper element of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]) { - tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0] - CASE(tsrc[63:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[63:0] := src1[63:0] - 1 : dest[63:0] := tsrc[63:0] - 2 : dest[63:0] := QNaN(tsrc[63:0]) - 3 : dest[63:0] := QNAN_Indefinite - 4 : dest[63:0] := -INF - 5 : dest[63:0] := +INF - 6 : dest[63:0] := tsrc.sign? 
-INF : +INF - 7 : dest[63:0] := -0 - 8 : dest[63:0] := +0 - 9 : dest[63:0] := -1 - 10: dest[63:0] := +1 - 11: dest[63:0] := 1/2 - 12: dest[63:0] := 90.0 - 13: dest[63:0] := PI/2 - 14: dest[63:0] := MAX_FLOAT - 15: dest[63:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[63:0] -} -IF k[0] - dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := b[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst", and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? -INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -ELSE - dst[31:0] := a[31:0] -FI -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - [sae_note] - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Fix up the lower single-precision (32-bit) floating-point elements in "a" and "b" using the lower 32-bit integer in "c", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "b" to the upper elements of "dst". "imm8" is used to set the required flags reporting. - enum TOKEN_TYPE { - QNAN_TOKEN := 0, \ - SNAN_TOKEN := 1, \ - ZERO_VALUE_TOKEN := 2, \ - ONE_VALUE_TOKEN := 3, \ - NEG_INF_TOKEN := 4, \ - POS_INF_TOKEN := 5, \ - NEG_VALUE_TOKEN := 6, \ - POS_VALUE_TOKEN := 7 -} -DEFINE FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]) { - tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0] - CASE(tsrc[31:0]) OF - QNAN_TOKEN:j := 0 - SNAN_TOKEN:j := 1 - ZERO_VALUE_TOKEN: j := 2 - ONE_VALUE_TOKEN: j := 3 - NEG_INF_TOKEN: j := 4 - POS_INF_TOKEN: j := 5 - NEG_VALUE_TOKEN: j := 6 - POS_VALUE_TOKEN: j := 7 - ESAC - - token_response[3:0] := src3[3+4*j:4*j] - - CASE(token_response[3:0]) OF - 0 : dest[31:0] := src1[31:0] - 1 : dest[31:0] := tsrc[31:0] - 2 : dest[31:0] := QNaN(tsrc[31:0]) - 3 : dest[31:0] := QNAN_Indefinite - 4 : dest[31:0] := -INF - 5 : dest[31:0] := +INF - 6 : dest[31:0] := tsrc.sign? 
-INF : +INF - 7 : dest[31:0] := -0 - 8 : dest[31:0] := +0 - 9 : dest[31:0] := -1 - 10: dest[31:0] := +1 - 11: dest[31:0] := 1/2 - 12: dest[31:0] := 90.0 - 13: dest[31:0] := PI/2 - 14: dest[31:0] := MAX_FLOAT - 15: dest[31:0] := -MAX_FLOAT - ESAC - - CASE(tsrc[31:0]) OF - ZERO_VALUE_TOKEN: - IF (imm8[0]) #ZE; FI - ZERO_VALUE_TOKEN: - IF (imm8[1]) #IE; FI - ONE_VALUE_TOKEN: - IF (imm8[2]) #ZE; FI - ONE_VALUE_TOKEN: - IF (imm8[3]) #IE; FI - SNAN_TOKEN: - IF (imm8[4]) #IE; FI - NEG_INF_TOKEN: - IF (imm8[5]) #IE; FI - NEG_VALUE_TOKEN: - IF (imm8[6]) #IE; FI - POS_INF_TOKEN: - IF (imm8[7]) #IE; FI - ESAC - RETURN dest[31:0] -} -IF k[0] - dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := b[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - dst[63:0] := ConvertExpFP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - dst[63:0] := ConvertExpFP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower double-precision (64-bit) floating-point element in "b" to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[63:0] := ConvertExpFP64(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - dst[31:0] := ConvertExpFP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - dst[31:0] := ConvertExpFP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - [sae_note] - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower single-precision (32-bit) floating-point element in "b" to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst[31:0] := ConvertExpFP32(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[63:0] := GetNormalizedMantissa(b[63:0], sc, interv) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - IF k[0] - dst[31:0] := GetNormalizedMantissa(b[31:0], sc, interv) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round packed double-precision (64-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RoundScaleFP64(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round packed single-precision (32-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RoundScaleFP32(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -IF k[0] - dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round the lower double-precision (64-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [round_imm_note] - -DEFINE RoundScaleFP64(src1[63:0], imm8[7:0]) { - m[63:0] := FP64(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[63:0] := POW(2.0, -m) * ROUND(POW(2.0, m) * src1[63:0], imm8[3:0]) - IF IsInf(tmp[63:0]) - tmp[63:0] := src1[63:0] - FI - RETURN tmp[63:0] -} -dst[63:0] := RoundScaleFP64(b[63:0], imm8[7:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -IF k[0] - dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Round the lower single-precision (32-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP32(src1[31:0], imm8[7:0]) { - m[31:0] := FP32(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[31:0] := POW(FP32(2.0), -m) * ROUND(POW(FP32(2.0), m) * src1[31:0], imm8[3:0]) - IF IsInf(tmp[31:0]) - tmp[31:0] := src1[31:0] - FI - RETURN tmp[31:0] -} -dst[31:0] := RoundScaleFP32(b[31:0], imm8[7:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[31:0] -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -IF k[0] - dst[63:0] := SCALE(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -dst[63:0] := SCALE(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed double-precision (64-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[63:0] := tmp_src1[63:0] * POW(2.0, FLOOR(tmp_src2[63:0])) - RETURN dst[63:0] -} -dst[63:0] := SCALE(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -IF k[0] - dst[31:0] := SCALE(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -dst[31:0] := SCALE(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed single-precision (32-bit) floating-point elements in "a" using values from "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - DEFINE SCALE(src1, src2) { - IF (src2 == NaN) - IF (src2 == SNaN) - RETURN QNAN(src2) - FI - ELSE IF (src1 == NaN) - IF (src1 == SNaN) - RETURN QNAN(src1) - FI - IF (src2 != INF) - RETURN QNAN(src1) - FI - ELSE - tmp_src2 := src2 - tmp_src1 := src1 - IF (IS_DENORMAL(src2) AND MXCSR.DAZ) - tmp_src2 := 0 - FI - IF (IS_DENORMAL(src1) AND MXCSR.DAZ) - tmp_src1 := 0 - FI - FI - dst[31:0] := tmp_src1[31:0] * POW(2.0, FLOOR(tmp_src2[31:0])) - RETURN dst[63:0] -} -dst[31:0] := SCALE(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the 4 packed single-precision (32-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the 4 packed double-precision (64-bit) floating-point elements from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - dst[i+31:i] := a[n+31:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the 4 packed 32-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - n := (j % 4)*32 - IF k[j] - dst[i+31:i] := a[n+31:n] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - dst[i+63:i] := a[n+63:n] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the 4 packed 64-bit integers from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - n := (j % 4)*64 - IF k[j] - dst[i+63:i] := a[n+63:n] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the low double-precision (64-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the low single-precision (32-bit) floating-point element from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active double-precision (64-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active single-precision (32-bit) floating-point elements in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Load contiguous active double-precision (64-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Load contiguous active single-precision (32-bit) floating-point elements from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[1:0] OF -0: dst[127:0] := a[127:0] -1: dst[127:0] := a[255:128] -2: dst[127:0] := a[383:256] -3: dst[127:0] := a[511:384] -ESAC -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Extract 128 bits (composed of 4 packed 32-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[1:0] OF -0: tmp[127:0] := a[127:0] -1: tmp[127:0] := a[255:128] -2: tmp[127:0] := a[383:256] -3: tmp[127:0] := a[511:384] -ESAC -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the result in "dst". - -CASE imm8[0] OF -0: dst[255:0] := a[255:0] -1: dst[255:0] := a[511:256] -ESAC -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Extract 256 bits (composed of 4 packed 64-bit integers) from "a", selected with "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -CASE imm8[0] OF -0: tmp[255:0] := a[255:0] -1: tmp[255:0] := a[511:256] -ESAC -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[0]) OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: dst[127:0] := b[127:0] -1: dst[255:128] := b[127:0] -2: dst[383:256] := b[127:0] -3: dst[511:384] := b[127:0] -ESAC -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Copy "a" to "tmp", then insert 128 bits (composed of 4 packed 32-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[1:0]) OF -0: tmp[127:0] := b[127:0] -1: tmp[255:128] := b[127:0] -2: tmp[383:256] := b[127:0] -3: tmp[511:384] := b[127:0] -ESAC -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "dst" at the location specified by "imm8". - -dst[511:0] := a[511:0] -CASE (imm8[0]) OF -0: dst[255:0] := b[255:0] -1: dst[511:256] := b[255:0] -ESAC -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Copy "a" to "tmp", then insert 256 bits (composed of 4 packed 64-bit integers) from "b" into "tmp" at the location specified by "imm8". Store "tmp" to "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[511:0] := a[511:0] -CASE (imm8[0]) OF -0: tmp[255:0] := b[255:0] -1: tmp[511:256] := b[255:0] -ESAC -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the low packed 32-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Broadcast the low packed 64-bit integer from "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 32-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 32 -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[m+size-1:m] := a[i+31:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 64-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 64 -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[m+size-1:m] := a[i+63:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 32-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := idx[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - off := idx[i+3:i]*32 - dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "idx" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := idx[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 64-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - off := idx[i+2:i]*64 - dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -IF (b[1] == 0) tmp_dst[63:0] := a[63:0]; FI -IF (b[1] == 1) tmp_dst[63:0] := a[127:64]; FI -IF (b[65] == 0) tmp_dst[127:64] := a[63:0]; FI -IF (b[65] == 1) tmp_dst[127:64] := a[127:64]; FI -IF (b[129] == 0) tmp_dst[191:128] := a[191:128]; FI -IF (b[129] == 1) tmp_dst[191:128] := a[255:192]; FI -IF (b[193] == 0) tmp_dst[255:192] := a[191:128]; FI -IF (b[193] == 1) tmp_dst[255:192] := a[255:192]; FI -IF (b[257] == 0) tmp_dst[319:256] := a[319:256]; FI -IF (b[257] == 1) tmp_dst[319:256] := a[383:320]; FI -IF (b[321] == 0) tmp_dst[383:320] := a[319:256]; FI -IF (b[321] == 1) tmp_dst[383:320] := a[383:320]; FI -IF (b[385] == 0) tmp_dst[447:384] := a[447:384]; FI -IF (b[385] == 1) tmp_dst[447:384] := a[511:448]; FI -IF (b[449] == 0) tmp_dst[511:448] := a[447:384]; FI -IF (b[449] == 1) tmp_dst[511:448] := a[511:448]; FI -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -IF (imm8[0] == 0) dst[63:0] := a[63:0]; FI -IF (imm8[0] == 1) dst[63:0] := a[127:64]; FI -IF (imm8[1] == 0) dst[127:64] := a[63:0]; FI -IF (imm8[1] == 1) dst[127:64] := a[127:64]; FI -IF (imm8[2] == 0) dst[191:128] := a[191:128]; FI -IF (imm8[2] == 1) dst[191:128] := a[255:192]; FI -IF (imm8[3] == 0) dst[255:192] := a[191:128]; FI -IF (imm8[3] == 1) dst[255:192] := a[255:192]; FI -IF (imm8[4] == 0) dst[319:256] := a[319:256]; FI -IF (imm8[4] == 1) dst[319:256] := a[383:320]; FI -IF (imm8[5] == 0) dst[383:320] := a[319:256]; FI -IF (imm8[5] == 1) dst[383:320] := a[383:320]; FI -IF (imm8[6] == 0) dst[447:384] := a[447:384]; FI -IF (imm8[6] == 1) dst[447:384] := a[511:448]; FI -IF (imm8[7] == 0) dst[511:448] := a[447:384]; FI -IF (imm8[7] == 1) dst[511:448] := a[511:448]; FI -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - -IF (b[1] == 0) dst[63:0] := a[63:0]; FI -IF (b[1] == 1) dst[63:0] := a[127:64]; FI -IF (b[65] == 0) dst[127:64] := a[63:0]; FI -IF (b[65] == 1) dst[127:64] := a[127:64]; FI -IF (b[129] == 0) dst[191:128] := a[191:128]; FI -IF (b[129] == 1) dst[191:128] := a[255:192]; FI -IF (b[193] == 0) dst[255:192] := a[191:128]; FI -IF (b[193] == 1) dst[255:192] := a[255:192]; FI -IF (b[257] == 0) dst[319:256] := a[319:256]; FI -IF (b[257] == 1) dst[319:256] := a[383:320]; FI -IF (b[321] == 0) dst[383:320] := a[319:256]; FI -IF (b[321] == 1) dst[383:320] := a[383:320]; FI -IF (b[385] == 0) dst[447:384] := a[447:384]; FI -IF (b[385] == 1) dst[447:384] := a[511:448]; FI -IF (b[449] == 0) dst[511:448] := a[447:384]; FI -IF (b[449] == 1) dst[511:448] := a[511:448]; FI -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) -tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) -tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) -tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) -tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) -tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) -tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) -tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], b[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], b[33:32]) -tmp_dst[95:64] := SELECT4(a[127:0], b[65:64]) -tmp_dst[127:96] := SELECT4(a[127:0], b[97:96]) -tmp_dst[159:128] := SELECT4(a[255:128], b[129:128]) -tmp_dst[191:160] := SELECT4(a[255:128], b[161:160]) -tmp_dst[223:192] := SELECT4(a[255:128], b[193:192]) -tmp_dst[255:224] := SELECT4(a[255:128], b[225:224]) -tmp_dst[287:256] := SELECT4(a[383:256], b[257:256]) -tmp_dst[319:288] := SELECT4(a[383:256], b[289:288]) -tmp_dst[351:320] := SELECT4(a[383:256], b[321:320]) -tmp_dst[383:352] := SELECT4(a[383:256], b[353:352]) -tmp_dst[415:384] := SELECT4(a[511:384], b[385:384]) -tmp_dst[447:416] := SELECT4(a[511:384], b[417:416]) -tmp_dst[479:448] := SELECT4(a[511:384], b[449:448]) -tmp_dst[511:480] := SELECT4(a[511:384], b[481:480]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" within 128-bit lanes using the control in "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], b[1:0]) -dst[63:32] := SELECT4(a[127:0], b[33:32]) -dst[95:64] := SELECT4(a[127:0], b[65:64]) -dst[127:96] := SELECT4(a[127:0], b[97:96]) -dst[159:128] := SELECT4(a[255:128], b[129:128]) -dst[191:160] := SELECT4(a[255:128], b[161:160]) -dst[223:192] := SELECT4(a[255:128], b[193:192]) -dst[255:224] := SELECT4(a[255:128], b[225:224]) -dst[287:256] := SELECT4(a[383:256], b[257:256]) -dst[319:288] := SELECT4(a[383:256], b[289:288]) -dst[351:320] := SELECT4(a[383:256], b[321:320]) -dst[383:352] := SELECT4(a[383:256], b[353:352]) -dst[415:384] := SELECT4(a[511:384], b[385:384]) -dst[447:416] := SELECT4(a[511:384], b[417:416]) -dst[479:448] := SELECT4(a[511:384], b[449:448]) -dst[511:480] := SELECT4(a[511:384], b[481:480]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle double-precision (64-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - dst[i+63:i] := a[id+63:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle single-precision (32-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - IF k[j] - dst[i+63:i] := a[id+63:id] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle 64-bit integers in "a" within 256-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[63:0] := src[63:0] - 1: tmp[63:0] := src[127:64] - 2: tmp[63:0] := src[191:128] - 3: tmp[63:0] := src[255:192] - ESAC - RETURN tmp[63:0] -} -dst[63:0] := SELECT4(a[255:0], imm8[1:0]) -dst[127:64] := SELECT4(a[255:0], imm8[3:2]) -dst[191:128] := SELECT4(a[255:0], imm8[5:4]) -dst[255:192] := SELECT4(a[255:0], imm8[7:6]) -dst[319:256] := SELECT4(a[511:256], imm8[1:0]) -dst[383:320] := SELECT4(a[511:256], imm8[3:2]) -dst[447:384] := SELECT4(a[511:256], imm8[5:4]) -dst[511:448] := SELECT4(a[511:256], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle 64-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - id := idx[i+2:i]*64 - dst[i+63:i] := a[id+63:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 32-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[m+31:m] - m := m + 32 - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 64-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[m+63:m] - m := m + 64 - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128-bits (composed of 4 32-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle 128-bits (composed of 2 64-bit integers) selected by "imm8" from "a" and "b", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[127:0] := src[127:0] - 1: tmp[127:0] := src[255:128] - 2: tmp[127:0] := src[383:256] - 3: tmp[127:0] := src[511:384] - ESAC - RETURN tmp[127:0] -} -dst[127:0] := SELECT4(a[511:0], imm8[1:0]) -dst[255:128] := SELECT4(a[511:0], imm8[3:2]) -dst[383:256] := SELECT4(b[511:0], imm8[5:4]) -dst[511:384] := SELECT4(b[511:0], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in "imm8", and store the results in "dst". - -dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] -dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192] -dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192] -dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320] -dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320] -dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448] -dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" and "b" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(b[255:128], imm8[5:4]) -dst[255:224] := SELECT4(b[255:128], imm8[7:6]) -dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -dst[351:320] := SELECT4(b[383:256], imm8[5:4]) -dst[383:352] := SELECT4(b[383:256], imm8[7:6]) -dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -dst[479:448] := SELECT4(b[511:384], imm8[5:4]) -dst[511:480] := SELECT4(b[511:384], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp_dst[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) -dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128]) -dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256]) -dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0 - - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := src[m+63:m] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - m := j*64 - IF k[j] - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) - ELSE - dst[m+63:m] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] - -FOR j := 0 to 15 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round2_note] - -FOR j := 0 to 15 - i := 16*j - l := 32*j - IF k[j] - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_UInt32(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_UInt64(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [sae_note] - -dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [sae_note] - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [sae_note] - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - [round_note] - -dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - [round_note] - -dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_UInt32(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_UInt64(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[k+63:k]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 32*j - l := 64*j - IF k[j] - dst[i+31:i] := Convert_FP64_To_UInt32_Truncate(a[l+63:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_FP32_To_UInt32_Truncate(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_UInt32_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_UInt64_Truncate(a[63:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - [sae_note] - -dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_UInt32_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_UInt64_Truncate(a[31:0]) - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - l := j*32 - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[i+63:i] := Convert_Int64_To_FP64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - IF k[j] - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the unsigned 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the unsigned 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the unsigned 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert the unsigned 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := Truncate8(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Truncate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 8-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Truncate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := Truncate32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Truncate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Truncate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := Truncate16(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Truncate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed 64-bit integers in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Truncate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := Saturate8(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := Saturate8(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 8-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := Saturate8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := Saturate32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := Saturate32(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 32-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := Saturate32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := Saturate16(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := Saturate16(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed 16-bit integers with signed saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := Saturate16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[i+31:i] := SignExtend32(a[k+7:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[i+63:i] := SignExtend64(a[k+7:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := SignExtend64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[i+31:i] := SignExtend32(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - l := j*16 - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := SignExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[i+63:i] := SignExtend64(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := SignExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+31:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 32-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+31:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+31:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[k+7:k] := SaturateU8(a[i+63:i]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := src[l+7:l] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - MEM[base_addr+l+7:base_addr+l] := SaturateU8(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 8-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[l+7:l] := SaturateU8(a[i+63:i]) - ELSE - dst[l+7:l] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[k+31:k] := SaturateU32(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - MEM[base_addr+l+31:base_addr+l] := SaturateU32(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 32-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[l+31:l] := SaturateU32(a[i+63:i]) - ELSE - dst[l+31:l] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[k+15:k] := SaturateU16(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := src[l+15:l] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - Store - - - - - Convert packed unsigned 64-bit integers in "a" to packed 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - MEM[base_addr+l+15:base_addr+l] := SaturateU16(a[i+63:i]) - FI -ENDFOR - - - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed unsigned 16-bit integers with unsigned saturation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[l+15:l] := SaturateU16(a[i+63:i]) - ELSE - dst[l+15:l] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 8*j - dst[i+31:i] := ZeroExtend32(a[k+7:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 8*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+7:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 8*j - dst[i+63:i] := ZeroExtend64(a[k+7:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 8*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+7:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 32*j - dst[i+63:i] := ZeroExtend64(a[k+31:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 32*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+31:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 15 - i := 32*j - k := 16*j - dst[i+31:i] := ZeroExtend32(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := 32*j - l := 16*j - IF k[j] - dst[i+31:i] := ZeroExtend32(a[l+15:l]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := 64*j - k := 16*j - dst[i+63:i] := ZeroExtend64(a[k+15:k]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := 64*j - l := 16*j - IF k[j] - dst[i+63:i] := ZeroExtend64(a[l+15:l]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] - - - AVX512F -
immintrin.h
- Convert -
- - - - Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". - -dst[63:0] := a[63:0] - - - AVX512F -
immintrin.h
- Convert -
- - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] - -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] - -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MAX(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note][max_float_note] - -dst[63:0] := MAX(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MAX(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -dst[31:0] := MAX(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note][min_float_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := MIN(a[63:0], b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" , and copy the upper element from "a" to the upper element of "dst". [sae_note][min_float_note] - -dst[63:0] := MIN(a[63:0], b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := MIN(a[31:0], b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -dst[31:0] := MIN(a[31:0], b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ABS(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compute the absolute value of packed signed 64-bit integers in "a", and store the unsigned results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 64-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Move packed double-precision (64-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Move packed single-precision (32-bit) floating-point elements from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -tmp[319:256] := a[319:256] -tmp[383:320] := a[319:256] -tmp[447:384] := a[447:384] -tmp[511:448] := a[447:384] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[63:0] := a[63:0] -tmp[127:64] := a[63:0] -tmp[191:128] := a[191:128] -tmp[255:192] := a[191:128] -tmp[319:256] := a[319:256] -tmp[383:320] := a[319:256] -tmp[447:384] := a[447:384] -tmp[511:448] := a[447:384] -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := tmp[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - Duplicate even-indexed double-precision (64-bit) floating-point elements from "a", and store the results in "dst". - -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] -dst[191:128] := a[191:128] -dst[255:192] := a[191:128] -dst[319:256] := a[319:256] -dst[383:320] := a[319:256] -dst[447:384] := a[447:384] -dst[511:448] := a[447:384] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Move packed 32-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Move packed 64-bit integers from "a" into "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := b[63:0] -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := b[63:0] -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -tmp[287:256] := a[319:288] -tmp[319:288] := a[319:288] -tmp[351:320] := a[383:352] -tmp[383:352] := a[383:352] -tmp[415:384] := a[447:416] -tmp[447:416] := a[447:416] -tmp[479:448] := a[511:480] -tmp[511:480] := a[511:480] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[63:32] -tmp[63:32] := a[63:32] -tmp[95:64] := a[127:96] -tmp[127:96] := a[127:96] -tmp[159:128] := a[191:160] -tmp[191:160] := a[191:160] -tmp[223:192] := a[255:224] -tmp[255:224] := a[255:224] -tmp[287:256] := a[319:288] -tmp[319:288] := a[319:288] -tmp[351:320] := a[383:352] -tmp[383:352] := a[383:352] -tmp[415:384] := a[447:416] -tmp[447:416] := a[447:416] -tmp[479:448] := a[511:480] -tmp[511:480] := a[511:480] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := a[63:32] -dst[95:64] := a[127:96] -dst[127:96] := a[127:96] -dst[159:128] := a[191:160] -dst[191:160] := a[191:160] -dst[223:192] := a[255:224] -dst[255:224] := a[255:224] -dst[287:256] := a[319:288] -dst[319:288] := a[319:288] -dst[351:320] := a[383:352] -dst[383:352] := a[383:352] -dst[415:384] := a[447:416] -dst[447:416] := a[447:416] -dst[479:448] := a[511:480] -dst[511:480] := a[511:480] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -tmp[287:256] := a[287:256] -tmp[319:288] := a[287:256] -tmp[351:320] := a[351:320] -tmp[383:352] := a[351:320] -tmp[415:384] := a[415:384] -tmp[447:416] := a[415:384] -tmp[479:448] := a[479:448] -tmp[511:480] := a[479:448] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -tmp[31:0] := a[31:0] -tmp[63:32] := a[31:0] -tmp[95:64] := a[95:64] -tmp[127:96] := a[95:64] -tmp[159:128] := a[159:128] -tmp[191:160] := a[159:128] -tmp[223:192] := a[223:192] -tmp[255:224] := a[223:192] -tmp[287:256] := a[287:256] -tmp[319:288] := a[287:256] -tmp[351:320] := a[351:320] -tmp[383:352] := a[351:320] -tmp[415:384] := a[415:384] -tmp[447:416] := a[415:384] -tmp[479:448] := a[479:448] -tmp[511:480] := a[479:448] -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := a[31:0] -dst[95:64] := a[95:64] -dst[127:96] := a[95:64] -dst[159:128] := a[159:128] -dst[191:160] := a[159:128] -dst[223:192] := a[223:192] -dst[255:224] := a[223:192] -dst[287:256] := a[287:256] -dst[319:288] := a[287:256] -dst[351:320] := a[351:320] -dst[383:352] := a[351:320] -dst[415:384] := a[415:384] -dst[447:416] := a[415:384] -dst[479:448] := a[479:448] -dst[511:480] := a[479:448] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := b[31:0] -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := b[31:0] -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 32-bit granularity (32-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 15 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 15 - i := j*32 - IF k[j] - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 32-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 15 - i := j*32 - FOR h := 0 to 31 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using writemask "k" at 64-bit granularity (64-bit elements are copied from "a" when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst" using zeromask "k" at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*64 - IF k[j] - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in "imm8". For each bit in each packed 64-bit integer, the corresponding bit from "a", "b", and "c" are used according to "imm8", and the result is written to the corresponding bit in "dst". - -DEFINE TernaryOP(imm8, a, b, c) { - CASE imm8[7:0] OF - 0: dst[0] := 0 // imm8[7:0] := 0 - 1: dst[0] := NOT (a OR b OR c) // imm8[7:0] := NOT (_MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C) - // ... - 254: dst[0] := a OR b OR c // imm8[7:0] := _MM_TERNLOG_A OR _MM_TERNLOG_B OR _MM_TERNLOG_C - 255: dst[0] := 1 // imm8[7:0] := 1 - ESAC -} -imm8[7:0] = LogicExp(_MM_TERNLOG_A, _MM_TERNLOG_B, _MM_TERNLOG_C) -FOR j := 0 to 7 - i := j*64 - FOR h := 0 to 63 - dst[i+h] := TernaryOP(imm8[7:0], a[i+h], b[i+h], c[i+h]) - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 7 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise NAND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 15 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise NAND of packed 64-bit integers in "a" and "b", producing intermediate 64-bit values, and set the corresponding bit in result mask "k" if the intermediate value is zero. - -FOR j := 0 to 7 - i := j*64 - k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - Broadcast 8-bit integer "a" to all elements of "dst". - -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - - - Broadcast 32-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - - Broadcast 32-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[31:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - Broadcast 32-bit integer "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - - - Broadcast 64-bit integer "a" to all elements of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - - Broadcast 64-bit integer "a" to all elements of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[63:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - Broadcast 64-bit integer "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - Broadcast the low packed 16-bit integer from "a" to all elements of "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed 32-bit integers in "dst" with the repeated 4 element sequence. - -dst[31:0] := a -dst[63:32] := b -dst[95:64] := c -dst[127:96] := d -dst[159:128] := a -dst[191:160] := b -dst[223:192] := c -dst[255:224] := d -dst[287:256] := a -dst[319:288] := b -dst[351:320] := c -dst[383:352] := d -dst[415:384] := a -dst[447:416] := b -dst[479:448] := c -dst[511:480] := d -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed 64-bit integers in "dst" with the repeated 4 element sequence. - -dst[63:0] := a -dst[127:64] := b -dst[191:128] := c -dst[255:192] := d -dst[319:256] := a -dst[383:320] := b -dst[447:384] := c -dst[511:448] := d -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence. - -dst[63:0] := a -dst[127:64] := b -dst[191:128] := c -dst[255:192] := d -dst[319:256] := a -dst[383:320] := b -dst[447:384] := c -dst[511:448] := d -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence. - -dst[31:0] := a -dst[63:32] := b -dst[95:64] := c -dst[127:96] := d -dst[159:128] := a -dst[191:160] := b -dst[223:192] := c -dst[255:224] := d -dst[287:256] := a -dst[319:288] := b -dst[351:320] := c -dst[383:352] := d -dst[415:384] := a -dst[447:416] := b -dst[479:448] := c -dst[511:480] := d -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. - -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 -dst[135:128] := e16 -dst[143:136] := e17 -dst[151:144] := e18 -dst[159:152] := e19 -dst[167:160] := e20 -dst[175:168] := e21 -dst[183:176] := e22 -dst[191:184] := e23 -dst[199:192] := e24 -dst[207:200] := e25 -dst[215:208] := e26 -dst[223:216] := e27 -dst[231:224] := e28 -dst[239:232] := e29 -dst[247:240] := e30 -dst[255:248] := e31 -dst[263:256] := e32 -dst[271:264] := e33 -dst[279:272] := e34 -dst[287:280] := e35 -dst[295:288] := e36 -dst[303:296] := e37 -dst[311:304] := e38 -dst[319:312] := e39 -dst[327:320] := e40 -dst[335:328] := e41 -dst[343:336] := e42 -dst[351:344] := e43 -dst[359:352] := e44 -dst[367:360] := e45 -dst[375:368] := e46 -dst[383:376] := e47 -dst[391:384] := e48 -dst[399:392] := e49 -dst[407:400] := e50 -dst[415:408] := e51 -dst[423:416] := e52 -dst[431:424] := e53 -dst[439:432] := e54 -dst[447:440] := e55 -dst[455:448] := e56 -dst[463:456] := e57 -dst[471:464] := e58 -dst[479:472] := e59 -dst[487:480] := e60 -dst[495:488] := e61 -dst[503:496] := e62 -dst[511:504] := e63 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. - -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 -dst[143:128] := e8 -dst[159:144] := e9 -dst[175:160] := e10 -dst[191:176] := e11 -dst[207:192] := e12 -dst[223:208] := e13 -dst[239:224] := e14 -dst[255:240] := e15 -dst[271:256] := e16 -dst[287:272] := e17 -dst[303:288] := e18 -dst[319:304] := e19 -dst[335:320] := e20 -dst[351:336] := e21 -dst[367:352] := e22 -dst[383:368] := e23 -dst[399:384] := e24 -dst[415:400] := e25 -dst[431:416] := e26 -dst[447:432] := e27 -dst[463:448] := e28 -dst[479:464] := e29 -dst[495:480] := e30 -dst[511:496] := e31 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[287:256] := e8 -dst[319:288] := e9 -dst[351:320] := e10 -dst[383:352] := e11 -dst[415:384] := e12 -dst[447:416] := e13 -dst[479:448] := e14 -dst[511:480] := e15 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - Set packed 64-bit integers in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[319:256] := e4 -dst[383:320] := e5 -dst[447:384] := e6 -dst[511:448] := e7 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 -dst[191:128] := e2 -dst[255:192] := e3 -dst[319:256] := e4 -dst[383:320] := e5 -dst[447:384] := e6 -dst[511:448] := e7 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 -dst[159:128] := e4 -dst[191:160] := e5 -dst[223:192] := e6 -dst[255:224] := e7 -dst[287:256] := e8 -dst[319:288] := e9 -dst[351:320] := e10 -dst[383:352] := e11 -dst[415:384] := e12 -dst[447:416] := e13 -dst[479:448] := e14 -dst[511:480] := e15 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed 32-bit integers in "dst" with the repeated 4 element sequence in reverse order. - -dst[31:0] := d -dst[63:32] := c -dst[95:64] := b -dst[127:96] := a -dst[159:128] := d -dst[191:160] := c -dst[223:192] := b -dst[255:224] := a -dst[287:256] := d -dst[319:288] := c -dst[351:320] := b -dst[383:352] := a -dst[415:384] := d -dst[447:416] := c -dst[479:448] := b -dst[511:480] := a -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed 64-bit integers in "dst" with the repeated 4 element sequence in reverse order. - -dst[63:0] := d -dst[127:64] := c -dst[191:128] := b -dst[255:192] := a -dst[319:256] := d -dst[383:320] := c -dst[447:384] := b -dst[511:448] := a -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. - -dst[63:0] := d -dst[127:64] := c -dst[191:128] := b -dst[255:192] := a -dst[319:256] := d -dst[383:320] := c -dst[447:384] := b -dst[511:448] := a -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the repeated 4 element sequence in reverse order. - -dst[31:0] := d -dst[63:32] := c -dst[95:64] := b -dst[127:96] := a -dst[159:128] := d -dst[191:160] := c -dst[223:192] := b -dst[255:224] := a -dst[287:256] := d -dst[319:288] := c -dst[351:320] := b -dst[383:352] := a -dst[415:384] := d -dst[447:416] := c -dst[479:448] := b -dst[511:480] := a -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. - -dst[31:0] := e15 -dst[63:32] := e14 -dst[95:64] := e13 -dst[127:96] := e12 -dst[159:128] := e11 -dst[191:160] := e10 -dst[223:192] := e9 -dst[255:224] := e8 -dst[287:256] := e7 -dst[319:288] := e6 -dst[351:320] := e5 -dst[383:352] := e4 -dst[415:384] := e3 -dst[447:416] := e2 -dst[479:448] := e1 -dst[511:480] := e0 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. - -dst[63:0] := e7 -dst[127:64] := e6 -dst[191:128] := e5 -dst[255:192] := e4 -dst[319:256] := e3 -dst[383:320] := e2 -dst[447:384] := e1 -dst[511:448] := e0 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[63:0] := e7 -dst[127:64] := e6 -dst[191:128] := e5 -dst[255:192] := e4 -dst[319:256] := e3 -dst[383:320] := e2 -dst[447:384] := e1 -dst[511:448] := e0 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[31:0] := e15 -dst[63:32] := e14 -dst[95:64] := e13 -dst[127:96] := e12 -dst[159:128] := e11 -dst[191:160] := e10 -dst[223:192] := e9 -dst[255:224] := e8 -dst[287:256] := e7 -dst[319:288] := e6 -dst[351:320] := e5 -dst[383:352] := e4 -dst[415:384] := e3 -dst[447:416] := e2 -dst[479:448] := e1 -dst[511:480] := e0 -dst[MAX:512] := 0 - - AVX512F -
immintrin.h
- Set -
- - - - Return vector of type __m512 with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - Return vector of type __m512i with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - Return vector of type __m512d with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - Return vector of type __m512 with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - Return vector of type __m512i with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512F -
immintrin.h
- Set -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src << count) OR (src >> (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the left by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE LEFT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src << count) OR (src >> (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in "imm8", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 32-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_DWORDS(src, count_src) { - count := count_src % 32 - RETURN (src >>count) OR (src << (32 - count)) -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Rotate the bits in each packed 64-bit integer in "a" to the right by the number of bits specified in the corresponding element of "b", and store the results in "dst". - -DEFINE RIGHT_ROTATE_QWORDS(src, count_src) { - count := count_src % 64 - RETURN (src >> count) OR (src << (64 - count)) -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0x0) - ELSE - dst[i+63:i] := SignExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := SignExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := (a[i+63] ? 0xFFFFFFFFFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF count[i+63:i] < 64 - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[63:0] := (1.0 / b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[63:0] := (1.0 / b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -dst[63:0] := (1.0 / b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[31:0] := (1.0 / b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[31:0] := (1.0 / b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -dst[31:0] := (1.0 / b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (1.0 / SQRT(a[i+63:i])) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 2^-14. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[63:0] := (1.0 / SQRT(b[63:0])) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[63:0] := (1.0 / SQRT(b[63:0])) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". The maximum relative error for this approximation is less than 2^-14. - -dst[63:0] := (1.0 / SQRT(b[63:0])) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[31:0] := (1.0 / SQRT(b[31:0])) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -IF k[0] - dst[31:0] := (1.0 / SQRT(b[31:0])) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 2^-14. - -dst[31:0] := (1.0 / SQRT(b[31:0])) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note]. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := SQRT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - [round_note]. - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := SQRT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - [round_note]. - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := src[63:0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst[63:0] := SQRT(b[63:0]) -ELSE - dst[63:0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := SQRT(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := src[31:0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst[31:0] := SQRT(b[31:0]) -ELSE - dst[31:0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := SQRT(b[31:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512F -
immintrin.h
- Elementary Math Functions -
- - - - Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512d to type __m128d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512 to type __m128. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512d to type __m256d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512 to type __m256. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512i to type __m128i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512i to type __m256i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Return vector of type __m512 with undefined elements. - AVX512F -
immintrin.h
- General Support -
- - - Return vector of type __m512i with undefined elements. - AVX512F -
immintrin.h
- General Support -
- - - Return vector of type __m512d with undefined elements. - AVX512F -
immintrin.h
- General Support -
- - - Return vector of type __m512 with undefined elements. - AVX512F -
immintrin.h
- General Support -
- - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] + b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := c[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := c[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] + b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - -FOR j := 0 to 15 - i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - [round_note] - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 32-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[31:0] + src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_ADD(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0 - FI -ENDFOR -dst[31:0] := REDUCE_ADD(tmp, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 64-bit integers in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[63:0] + src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_ADD(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0 - FI -ENDFOR -dst[63:0] := REDUCE_ADD(tmp, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[63:0] + src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_ADD(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0 - FI -ENDFOR -dst[63:0] := REDUCE_ADD(tmp, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition using mask "k". Returns the sum of all active elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[31:0] + src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_ADD(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0 - FI -ENDFOR -dst[31:0] := REDUCE_ADD(tmp, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 32-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[31:0] * src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_MUL(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 1 - FI -ENDFOR -dst[31:0] := REDUCE_MUL(tmp, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed 64-bit integers in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[63:0] * src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_MUL(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 1 - FI -ENDFOR -dst[63:0] := REDUCE_MUL(tmp, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[63:0] * src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_MUL(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 1.0 - FI -ENDFOR -dst[63:0] := REDUCE_MUL(tmp, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication using mask "k". Returns the product of all active elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[31:0] * src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_MUL(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := FP32(1.0) - FI -ENDFOR -dst[31:0] := REDUCE_MUL(tmp, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 32-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[31:0] + src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_ADD(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_ADD(a, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 64-bit integers in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[63:0] + src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_ADD(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_ADD(a, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[63:0] + src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] + src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_ADD(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_ADD(a, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". - -DEFINE REDUCE_ADD(src, len) { - IF len == 2 - RETURN src[31:0] + src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] + src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_ADD(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_ADD(a, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 32-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[31:0] * src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_MUL(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MUL(a, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed 64-bit integers in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[63:0] * src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_MUL(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MUL(a, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[63:0] * src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] * src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_MUL(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MUL(a, 8) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -DEFINE REDUCE_MUL(src, len) { - IF len == 2 - RETURN src[31:0] * src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] * src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_MUL(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MUL(a, 16) - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ABS(v2[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Finds the absolute value of each packed single-precision (32-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ABS(v2[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ABS(v2[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Finds the absolute value of each packed double-precision (64-bit) floating-point element in "v2", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ABS(v2[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst". - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -dst[511:0] := temp[511:0] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Concatenate "a" and "b" into a 128-byte intermediate result, shift the result right by "imm8" 32-bit elements, and store the low 64 bytes (16 elements) in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -temp[1023:512] := a[511:0] -temp[511:0] := b[511:0] -temp[1023:0] := temp[1023:0] >> (32*imm8[3:0]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := temp[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - [sae_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "interv" and the sign depends on "sc" and the source sign. - [getmant_note][sae_note] - FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Blend packed 32-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Blend packed 64-bit integers from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_mask_permutexvar_epi32", and it is recommended that you use that intrinsic name. - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - IF k[j] - dst[i+31:i] := a[id+31:id] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the "permutevar" name. This intrinsic is identical to "_mm512_permutexvar_epi32", and it is recommended that you use that intrinsic name. - -FOR j := 0 to 15 - i := j*32 - id := idx[i+3:i]*32 - dst[i+31:i] := a[id+31:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := tmp_dst[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" within 128-bit lanes using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) -dst[159:128] := SELECT4(a[255:128], imm8[1:0]) -dst[191:160] := SELECT4(a[255:128], imm8[3:2]) -dst[223:192] := SELECT4(a[255:128], imm8[5:4]) -dst[255:224] := SELECT4(a[255:128], imm8[7:6]) -dst[287:256] := SELECT4(a[383:256], imm8[1:0]) -dst[319:288] := SELECT4(a[383:256], imm8[3:2]) -dst[351:320] := SELECT4(a[383:256], imm8[5:4]) -dst[383:352] := SELECT4(a[383:256], imm8[7:6]) -dst[415:384] := SELECT4(a[511:384], imm8[1:0]) -dst[447:416] := SELECT4(a[511:384], imm8[3:2]) -dst[479:448] := SELECT4(a[511:384], imm8[5:4]) -dst[511:480] := SELECT4(a[511:384], imm8[7:6]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Swizzle -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". - -FOR j := 0 to 7 - i := j*64 - k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". - FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". - FOR j := 0 to 7 - i := j*64 - k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (!(a[i+63:i] <= b[i+63:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (!(a[i+63:i] < b[i+63:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 7 - i := j*64 - IF k1[j] - k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k". - FOR j := 0 to 15 - i := j*32 - k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k". - FOR j := 0 to 15 - i := j*32 - k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (!(a[i+31:i] <= b[i+31:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := (!(a[i+31:i] < b[i+31:i])) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] != NaN) AND (b[i+31:i] != NaN)) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] == NaN) OR (b[i+31:i] == NaN)) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k". - -FOR j := 0 to 15 - i := j*32 - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[2:0]) OF -0: OP := _MM_CMPINT_EQ -1: OP := _MM_CMPINT_LT -2: OP := _MM_CMPINT_LE -3: OP := _MM_CMPINT_FALSE -4: OP := _MM_CMPINT_NE -5: OP := _MM_CMPINT_NLT -6: OP := _MM_CMPINT_NLE -7: OP := _MM_CMPINT_TRUE -ESAC -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for equality, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for greater-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than-or-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for less-than, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Compare packed unsigned 32-bit integers in "a" and "b" for not-equal, and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Compare -
- - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed double-precision (64-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed single-precision (32-bit) floating-point elements from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 16 packed 32-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed 32-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 8 packed 64-bit integers) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Load packed 64-bit integers from memory into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst". "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). Gathered elements are merged into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+31:i] := MEM[addr+31:addr] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Loads 8 64-bit integer elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" and stores them in "dst". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - - - Loads 8 double-precision (64-bit) floating-point elements from memory starting at location "base_addr" at packed 32-bit integer indices stored in the lower half of "vindex" scaled by "scale" into "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - dst[i+63:i] := MEM[addr+63:addr] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Load -
- - - - - - Move packed double-precision (64-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Move packed single-precision (32-bit) floating-point elements from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Move packed 32-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Move packed 64-bit integers from "a" to "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Move -
- - - - - - Store packed double-precision (64-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed single-precision (32-bit) floating-point elements from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed 32-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 15 - i := j*32 - IF k[j] - MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 16 packed 32-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits of integer data from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - Store packed 64-bit integers from "a" into memory using writemask "k". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -FOR j := 0 to 7 - i := j*64 - IF k[j] - MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 8 packed 64-bit integers) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter 32-bit integers from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale"). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Scatter single-precision (32-bit) floating-point elements from "a" into memory using 32-bit indices. 32-bit elements are stored at addresses starting at "base_addr" and offset by each 32-bit element in "vindex" (each index is scaled by the factor in "scale") subject to mask "k" (elements are not stored when the corresponding mask bit is not set). "scale" should be 1, 2, 4 or 8. - -FOR j := 0 to 15 - i := j*32 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+31:addr] := a[i+31:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Stores 8 packed double-precision (64-bit) floating-point elements in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". Only those elements whose corresponding mask bit is set in writemask "k" are written to memory. - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] AND b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] AND b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 512 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[511:0] := ((NOT a[511:0]) AND b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 32-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in "a" and then AND with "b", and store the results in "dst". - -dst[511:0] := ((NOT a[511:0]) AND b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise NOT of packed 64-bit integers in "a" and then AND with "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in "a" and "b", and store the results in "dst". - -dst[511:0] := (a[511:0] AND b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise AND of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] AND b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] OR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] OR b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] OR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" (subject to writemask "k1") if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*32 - IF k1[j] - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed 32-bit integers in "a" and "b", producing intermediate 32-bit values, and set the corresponding bit in result mask "k" if the intermediate value is non-zero. - -FOR j := 0 to 15 - i := j*32 - k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of 512 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[511:0] := (a[511:0] XOR b[511:0]) -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - Reduce the packed 32-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[31:0] AND src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_AND(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0xFFFFFFFF - FI -ENDFOR -dst[31:0] := REDUCE_AND(tmp, 16) - - AVX512F -
immintrin.h
- Logical -
- - - - - Reduce the packed 64-bit integers in "a" by bitwise AND using mask "k". Returns the bitwise AND of all active elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[63:0] AND src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_AND(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF - FI -ENDFOR -dst[63:0] := REDUCE_AND(tmp, 8) - - AVX512F -
immintrin.h
- Logical -
- - - - - Reduce the packed 32-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[31:0] OR src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_OR(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0 - FI -ENDFOR -dst[31:0] := REDUCE_OR(tmp, 16) - - AVX512F -
immintrin.h
- Logical -
- - - - - Reduce the packed 64-bit integers in "a" by bitwise OR using mask "k". Returns the bitwise OR of all active elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[63:0] OR src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_OR(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0 - FI -ENDFOR -dst[63:0] := REDUCE_OR(tmp, 8) - - AVX512F -
immintrin.h
- Logical -
- - - - Reduce the packed 32-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[31:0] AND src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] AND src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_AND(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_AND(a, 16) - - AVX512F -
immintrin.h
- Logical -
- - - - Reduce the packed 64-bit integers in "a" by bitwise AND. Returns the bitwise AND of all elements in "a". - -DEFINE REDUCE_AND(src, len) { - IF len == 2 - RETURN src[63:0] AND src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] AND src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_AND(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_AND(a, 8) - - AVX512F -
immintrin.h
- Logical -
- - - - Reduce the packed 32-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[31:0] OR src[63:32] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := src[i+31:i] OR src[i+32*len+31:i+32*len] - ENDFOR - RETURN REDUCE_OR(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_OR(a, 16) - - AVX512F -
immintrin.h
- Logical -
- - - - Reduce the packed 64-bit integers in "a" by bitwise OR. Returns the bitwise OR of all elements in "a". - -DEFINE REDUCE_OR(src, len) { - IF len == 2 - RETURN src[63:0] OR src[127:64] - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := src[i+63:i] OR src[i+64*len+63:i+64*len] - ENDFOR - RETURN REDUCE_OR(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_OR(a, 8) - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Performs element-by-element bitwise AND between packed 32-bit integer elements of "v2" and "v3", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := v2[i+31:i] & v3[i+31:i] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Logical -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := Int32(-0x80000000) - FI -ENDFOR -dst[31:0] := REDUCE_MAX(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := Int64(-0x8000000000000000) - FI -ENDFOR -dst[63:0] := REDUCE_MAX(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 32-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0 - FI -ENDFOR -dst[31:0] := REDUCE_MAX(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 64-bit integers in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0 - FI -ENDFOR -dst[63:0] := REDUCE_MAX(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := Cast_FP64(0xFFEFFFFFFFFFFFFF) - FI -ENDFOR -dst[63:0] := REDUCE_MAX(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum using mask "k". Returns the maximum of all active elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := Cast_FP32(0xFF7FFFFF) - FI -ENDFOR -dst[31:0] := REDUCE_MAX(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := Int32(0x7FFFFFFF) - FI -ENDFOR -dst[31:0] := REDUCE_MIN(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed signed 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := Int64(0x7FFFFFFFFFFFFFFF) - FI -ENDFOR -dst[63:0] := REDUCE_MIN(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 32-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := 0xFFFFFFFF - FI -ENDFOR -dst[31:0] := REDUCE_MIN(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed unsigned 64-bit integers in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := 0xFFFFFFFFFFFFFFFF - FI -ENDFOR -dst[63:0] := REDUCE_MIN(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -tmp := a -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[i+63:i] := a[i+63:i] - ELSE - tmp[i+63:i] := Cast_FP64(0x7FEFFFFFFFFFFFFF) - FI -ENDFOR -dst[63:0] := REDUCE_MIN(tmp, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum using mask "k". Returns the minimum of all active elements in "a". [min_float_note] - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -tmp := a -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[i+31:i] := a[i+31:i] - ELSE - tmp[i+31:i] := Cast_FP32(0x7F7FFFFF) - FI -ENDFOR -dst[31:0] := REDUCE_MIN(tmp, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MAX(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MAX(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 32-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MAX(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 64-bit integers in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MAX(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[63:0] > src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] > src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MAX(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MAX(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". - -DEFINE REDUCE_MAX(src, len) { - IF len == 2 - RETURN (src[31:0] > src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] > src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MAX(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MAX(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MIN(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed signed 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MIN(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 32-bit integers in "a" by minimum. Returns the minimum of all elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MIN(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed unsigned 64-bit integers in "a" by minimum. Returns the minimum of all elements in "a". - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MIN(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed double-precision (64-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[63:0] < src[127:64] ? src[63:0] : src[127:64]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*64 - src[i+63:i] := (src[i+63:i] < src[i+64*len+63:i+64*len] ? src[i+63:i] : src[i+64*len+63:i+64*len]) - ENDFOR - RETURN REDUCE_MIN(src[64*len-1:0], len) -} -dst[63:0] := REDUCE_MIN(a, 8) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - Reduce the packed single-precision (32-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] - -DEFINE REDUCE_MIN(src, len) { - IF len == 2 - RETURN (src[31:0] < src[63:32] ? src[31:0] : src[63:32]) - FI - len := len / 2 - FOR j:= 0 to (len-1) - i := j*32 - src[i+31:i] := (src[i+31:i] < src[i+32*len+31:i+32*len] ? src[i+31:i] : src[i+32*len+31:i+32*len]) - ENDFOR - RETURN REDUCE_MIN(src[32*len-1:0], len) -} -dst[31:0] := REDUCE_MIN(a, 16) - - AVX512F -
immintrin.h
- Special Math Functions -
- - - - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by the amount specified by the corresponding element in "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 15 - i := j*32 - IF count[i+31:i] < 32 - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Shift -
- - - - Cast vector of type __m512d to type __m512. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512d to type __m512i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512 to type __m512d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512 to type __m512i. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512i to type __m512d. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Cast vector of type __m512i to type __m512. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512F -
immintrin.h
- Cast -
- - - - Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". - -FOR j := 0 to 7 - i := j*32 - n := j*64 - dst[n+63:n] := Convert_FP32_To_FP64(v2[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[l+63:l] := Convert_FP32_To_FP64(v2[i+31:i]) - ELSE - dst[l+63:l] := src[l+63:l] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". - -FOR j := 0 to 7 - i := j*32 - l := j*64 - dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Performs element-by-element conversion of the lower half of packed 32-bit integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - n := j*64 - IF k[j] - dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) - ELSE - dst[n+63:n] := src[n+63:n] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst". - -FOR j := 0 to 7 - i := j*32 - n := j*64 - dst[n+63:n] := Convert_Int32_To_FP64(v2[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in "v2" to packed double-precision (64-bit) floating-point elements, storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - l := j*64 - IF k[j] - dst[l+63:l] := Convert_Int32_To_FP64(v2[i+31:i]) - ELSE - dst[l+63:l] := src[l+63:l] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst". The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. - -FOR j := 0 to 7 - i := j*64 - k := j*32 - dst[k+31:k] := Convert_FP64_To_FP32(v2[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in "v2" to single-precision (32-bit) floating-point elements and stores them in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. - -FOR j := 0 to 7 - i := j*64 - l := j*32 - IF k[j] - dst[l+31:l] := Convert_FP64_To_FP32(v2[i+63:i]) - ELSE - dst[l+31:l] := src[l+31:l] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512F -
immintrin.h
- Convert -
- - - - - - - Stores 8 packed 64-bit integer elements located in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale". - -FOR j := 0 to 7 - i := j*64 - m := j*32 - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - - Stores 8 packed 64-bit integer elements located in "a" to memory locations starting at location "base_addr" at packed 32-bit integer indices stored in "vindex" scaled by "scale" using writemask "k" (elements whose corresponding mask bit is not set are not written to memory). - -FOR j := 0 to 7 - i := j*64 - m := j*32 - IF k[j] - addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 - MEM[addr+63:addr] := a[i+63:i] - FI -ENDFOR - - - AVX512F -
immintrin.h
- Store -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512IFMA52 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "b" and "c" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i]) - dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512IFMA52 -
immintrin.h
- Arithmetic -
- - - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512VPOPCNTDQ - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := POPCNT(a[i+31:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 32-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := POPCNT(a[i+31:i]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := POPCNT(a[i+63:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 64-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := POPCNT(a[i+63:i]) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512VPOPCNTDQ -
immintrin.h
- Bit Manipulation -
- - - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 15 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:512] := 0 - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 15 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - Convert the BF16 (16-bit) floating-point element in "a" to a floating-point element, and store the result in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -dst[31:0] := Convert_BF16_To_FP32(a[15:0]) - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". - -FOR j := 0 to 31 - IF j < 16 - t := b.fp32[j] - ELSE - t := a.fp32[j-16] - FI - dst.word[j] := Convert_FP32_To_BF16(t) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF j < 16 - t := b.fp32[j] - ELSE - t := a.fp32[j-16] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF j < 16 - t := b.fp32[j] - ELSE - t := a.fp32[j-16] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Convert -
- - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 15 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BF16 - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 3 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:128] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 3 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 3 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 7 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:256] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 7 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed BF16 (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic neither raises any floating point exceptions nor turns sNAN into qNAN. - -FOR j := 0 to 7 - i := j*32 - m := j*16 - IF k[j] - dst[i+31:i] := Convert_BF16_To_FP32(a[m+15:m]) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert the single-precision (32-bit) floating-point element in "a" to a BF16 (16-bit) floating-point element, and store the result in "dst". - -dst[15:0] := Convert_FP32_To_BF16(a[31:0]) - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". - -FOR j := 0 to 7 - IF j < 4 - t := b.fp32[j] - ELSE - t := a.fp32[j-4] - FI - dst.word[j] := Convert_FP32_To_BF16(t) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF j < 4 - t := b.fp32[j] - ELSE - t := a.fp32[j-4] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF j < 4 - t := b.fp32[j] - ELSE - t := a.fp32[j-4] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst". - -FOR j := 0 to 15 - IF j < 8 - t := b.fp32[j] - ELSE - t := a.fp32[j-8] - FI - dst.word[j] := Convert_FP32_To_BF16(t) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF j < 8 - t := b.fp32[j] - ELSE - t := a.fp32[j-8] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in two vectors "a" and "b" to packed BF16 (16-bit) floating-point elements, and store the results in single vector "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF j < 8 - t := b.fp32[j] - ELSE - t := a.fp32[j-8] - FI - dst.word[j] := Convert_FP32_To_BF16(t) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed BF16 (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.word[j] := Convert_FP32_To_BF16(a.fp32[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 3 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst". - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 7 - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 7 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute dot-product of BF16 (16-bit) floating-point pairs in "a" and "b", accumulating the intermediate single-precision (32-bit) floating-point elements with elements in "src", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE make_fp32(x[15:0]) { - y.fp32 := 0.0 - y[31:16] := x[15:0] - RETURN y -} -dst := src -FOR j := 0 to 7 - IF k[j] - dst.fp32[j] += make_fp32(a.bf16[2*j+1]) * make_fp32(b.bf16[2*j+1]) - dst.fp32[j] += make_fp32(a.bf16[2*j+0]) * make_fp32(b.bf16[2*j+0]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BF16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 3 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ELSE - dst[i*8+j] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:32] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". - -FOR i := 0 to 3 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR -ENDFOR -dst[MAX:32] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 1 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ELSE - dst[i*8+j] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:16] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". - -FOR i := 0 to 1 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR -ENDFOR -dst[MAX:16] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_BITALG - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 7 //Qword - FOR j := 0 to 7 // Byte - IF k[i*8+j] - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ELSE - dst[i*8+j] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:64] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - Gather 64 bits from "b" using selection bits in "c". For each 64-bit element in "b", gather 8 bits from the 64-bit element in "b" at 8 bit position controlled by the 8 corresponding 8-bit elements of "c", and store the result in the corresponding 8-bit element of "dst". - -FOR i := 0 to 7 //Qword - FOR j := 0 to 7 // Byte - m := c.qword[i].byte[j] & 0x3F - dst[i*8+j] := b.qword[i].bit[m] - ENDFOR -ENDFOR -dst[MAX:64] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POPCNT(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 16-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POPCNT(a[i+15:i]) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst". - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 - dst[i+7:i] := POPCNT(a[i+7:i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - Count the number of logical 1 bits in packed 8-bit integers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE POPCNT(a) { - count := 0 - DO WHILE a > 0 - count += a[0] - a >>= 1 - OD - RETURN count -} -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := POPCNT(a[i+7:i]) - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_BITALG -
immintrin.h
- Bit Manipulation -
- - - - - Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ACOS(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ACOSH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ASIN(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ASINH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ATAN(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ATANH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := CubeRoot(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := CDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := InverseCDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := COSD(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := COSH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ERF(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := 1.0 - ERF(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := 1.0 / ERF(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := InvCubeRoot(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := InvSQRT(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := LOG(1.0 + a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - - Elementary Math FunctionsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ConvertExpFP16(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) - MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -cos_res[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SIND(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SINH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := CEIL(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := FLOOR(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ROUND(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". - - Elementary Math Functions -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := SQRT(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := TAN(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := TAND(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := TANH(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". - - Special Math FunctionsFOR j := 0 to 15 - i := j*16 - dst[i+15:i] := TRUNCATE(a[i+15:i]) -ENDFOR -dst[MAX:256] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ACOS(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ACOSH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ASIN(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ASINH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ATAN(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" expressed in radians. - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ATANH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := CubeRoot(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := CDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := InverseCDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := CEIL(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := COSD(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := COSH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ERF(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := 1.0 - ERF(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := 1.0 / ERF(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := FLOOR(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := InvSQRT(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := LOG(1.0 + a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ConvertExpFP16(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ACOS(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ACOSH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ASIN(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ASINH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ATAN(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ATANH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := CubeRoot(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := CDFNormal(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := InverseCDFNormal(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := CEIL(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := COS(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - TrigonometryFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := COSD(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := COSH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ERF(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := 1.0 - ERF(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Probability/StatisticsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := 1.0 / ERF(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POW(FP16(e), a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := FLOOR(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := InvSQRT(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := LOG(1.0 + a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := LOG(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - - - - Elementary Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ConvertExpFP16(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := NearbyInt(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := (1.0 / a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := RoundToNearestEven(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SIN(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". Elements are written to their respective locations using writemask "k" (elements are copied from "sin_src" or "cos_src" when the corresponding mask bit is not set). - - - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SIN(a[i+15:i]) - MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) - ELSE - dst[i+15:i] := sin_src[i+15:i] - MEM[mem_addr+i+15:mem_addr+i] := cos_src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - TrigonometryFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SIND(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := SINH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ROUND(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := TAN(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - TrigonometryFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := TAND(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := TANH(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - - - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := TRUNCATE(a[i+15:i]) - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Rounds each packed half-precision (16-bit) floating-point element in "a" to the nearest integer value and stores the results as packed half-precision floating-point elements in "dst". - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := NearbyInt(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Computes the reciprocal of packed half-precision (16-bit) floating-point elements in "a", storing the results in "dst". - - Elementary Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := (1.0 / a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Rounds the packed half-precision (16-bit) floating-point elements in "a" to the nearest even integer value and stores the results in "dst". - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := RoundToNearestEven(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) - MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -cos_res[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SIND(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := SINH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ROUND(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := TAN(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := TAND(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := TANH(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". - - Special Math FunctionsFOR j := 0 to 31 - i := j*16 - dst[i+15:i] := TRUNCATE(a[i+15:i]) -ENDFOR -dst[MAX:512] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ACOS(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ACOSH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ASIN(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ASINH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ATAN2(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ATAN(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ATANH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := CubeRoot(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := CDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cumulative distribution function of packed half-precision (16-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := InverseCDFNormal(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := COSD(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := COSH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ERF(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := 1.0 - ERF(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse complementary error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := 1.0 / (1.0 - ERF(a[i+15:i])) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse error function of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Probability/StatisticsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := 1.0 / ERF(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 10 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POW(FP16(10.0), a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of 2 raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POW(FP16(2.0), a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of "e" raised to the power of packed half-precision (16-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POW(FP16(e), a[i+15:i]) - 1.0 -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SQRT(POW(a[i+15:i], 2.0) + POW(b[i+15:i], 2.0)) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse cube root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := InvCubeRoot(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the inverse square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math FunctionsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := InvSQRT(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-10 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(10.0) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of one plus packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := LOG(1.0 + a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the base-2 logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) / LOG(2.0) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the natural logarithm of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := LOG(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - - Elementary Math FunctionsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ConvertExpFP16(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the exponential value of packed half-precision (16-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := POW(a[i+15:i], b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine and cosine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SIN(a[i+15:i]) - MEM[mem_addr+i+15:mem_addr+i] := COS(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -cos_res[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the sine of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SIND(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic sine of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SINH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" up to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := CEIL(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" down to an integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := FLOOR(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Round the packed half-precision (16-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed half-precision floating-point elements in "dst". - - Special Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ROUND(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". - - Elementary Math Functions -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SQRT(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := TAN(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - - TrigonometryFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := TAND(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Compute the hyperbolic tangent of packed half-precision (16-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - - Trigonometry -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := TANH(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - Truncate the packed half-precision (16-bit) floating-point elements in "a", and store the results as packed half-precision floating-point elements in "dst". - - Special Math FunctionsFOR j := 0 to 7 - i := j*16 - dst[i+15:i] := TRUNCATE(a[i+15:i]) -ENDFOR -dst[MAX:128] := 0 -
immintrin.h
AVX512_FP16
- - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := a.fp16[j] + b.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := a.fp16[j] + b.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := a.fp16[j] / b.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := a.fp16[j] / b.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 15 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := a.fp16[j] - b.fp16[j] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := a.fp16[j] - b.fp16[j] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR i := 0 TO 7 - dst.fp16[i] := a.fp16[i] * b.fp16[i] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 TO 7 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 TO 7 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR i := 0 TO 15 - dst.fp16[i] := a.fp16[i] * b.fp16[i] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 TO 15 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 TO 15 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 3 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 7 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". - -tmp := a -FOR i := 0 to 7 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -tmp := a -FOR i := 0 to 7 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] - - AVX512_FP16 - AVX512VL - 
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". - -tmp := a -FOR i := 0 to 7 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". - -tmp := a -FOR i := 0 to 7 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". - -tmp := a -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -tmp := a -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". - -tmp := a -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". - -tmp := a -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := ABS(v2.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := ABS(v2.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 -ENDFOR -k[MAX:8] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 7 - IF k1[j] - k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Compare -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 -ENDFOR -k[MAX:16] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Compare -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 15 - IF k1[j] - k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Compare -
- - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 96 bits of "dst" are zeroed out. - -FOR j := 0 TO 1 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:32] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 TO 3 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 to 3 - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 to 3 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The upper 64 bits of "dst" are zeroed out. - -FOR j := 0 to 3 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:64] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 3 - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 3 - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 3 - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 3 - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 1 - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 3 - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 1 - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 3 - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 1 - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 3 - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 1 - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 1 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 3 - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 3 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 15 - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 15 - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 15 - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 15 - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := src.fp64[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := src.fp64[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := src.fp32[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := src.fp32[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Convert -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 7 - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 15 - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [max_float_note] - -dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][max_float_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] > b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 7 - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 15 - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [min_float_note] - -dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [sae_note][min_float_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] < b.fp16[0] ? a.fp16[0] : b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Special Math Functions -
- - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 7 - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 15 - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 7 - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 15 - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 7 - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 7 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 7 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 15 - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 15 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 15 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 7 - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 15 - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 7 - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (src1.exp == 0) and (src1.fraction != 0) - denormal2 := (src2.exp == 0) and (src2.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR i := 0 to 7 - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) -ENDFOR -k[MAX:8] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR i := 0 to 7 - IF k1[i] - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) - ELSE - k[i] := 0 - FI -ENDFOR -k[MAX:8] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR i := 0 to 15 - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) -ENDFOR -k[MAX:16] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR i := 0 to 15 - IF k1[i] - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) - ELSE - k[i] := 0 - FI -ENDFOR -k[MAX:16] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle half-precision (16-bit) floating-point elements in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - off := idx[i+2:i] - dst.fp16[j] := idx[i+3] ? b.fp16[off] : a.fp16[off] -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - off := idx[i+3:i] - dst.fp16[j] := idx[i+4] ? b.fp16[off] : a.fp16[off] -ENDFOR -dst[MAX:256] := 0 - - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := b.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 7 - IF k[j] - dst.fp16[j] := b.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*16 - id := idx[i+3:i] - dst.fp16[j] := a.fp16[id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - - Shuffle half-precision (16-bit) floating-point elements in "a" using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - id := idx[i+2:i] - dst.fp16[j] := a.fp16[id] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Miscellaneous -
- - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR i := 0 to 7 - dst.fp16[i] := SQRT(a.fp16[i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR i := 0 to 15 - dst.fp16[i] := SQRT(a.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - dst.fp16[i] := (1.0 / a.fp16[i]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 7 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - dst.fp16[i] := (1.0 / a.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Elementary Math Functions -
- - - - Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Load -
- - - - Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[255:0] := MEM[mem_addr+255:mem_addr] -dst[MAX:256] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] -dst[MAX:128] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Load -
- - - - - Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 32-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Store -
- - - - - Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+255:mem_addr] := a[255:0] - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Store -
- - - - Return vector of type __m256h with undefined elements. - AVX512_FP16 - AVX512VL -
immintrin.h
- General Support -
- - - - Return vector of type __m128h with undefined elements. - AVX512_FP16 - AVX512VL -
immintrin.h
- General Support -
- - - - Return vector of type __m256h with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Set -
- - - - Return vector of type __m128h with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512_FP16 - AVX512VL -
immintrin.h
- Set -
- - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 TO 31 - dst.fp16[j] := a.fp16[j] + b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR j := 0 TO 31 - dst.fp16[j] := a.fp16[j] + b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Add packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] + b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := a.fp16[0] + b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := a.fp16[0] + b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] + b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] + b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] + b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Add the lower half-precision (16-bit) floating-point elements in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] + b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := a.fp16[j] / b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - dst.fp16[j] := a.fp16[j] / b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Divide packed half-precision (16-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := a.fp16[j] / b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := a.fp16[0] / b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] / b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] / b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := a.fp16[0] / b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] / b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Divide the lower half-precision (16-bit) floating-point element in "a" by the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] / b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) + c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := -(a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "a" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := a.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using writemask "k" (the element is copied from "c" when mask bit 0 is not set), and copy the upper 7 packed elements from "c" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := c.fp16[0] -FI -dst[127:16] := c[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := -(a.fp16[0] * b.fp16[0]) - c.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 31 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 31 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst". - [round_note] - -FOR j := 0 to 31 - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := c.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" to/from the intermediate result, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 31 - IF k[j] - IF ((j & 1) == 0) - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) + c.fp16[j] - ELSE - dst.fp16[j] := (a.fp16[j] * b.fp16[j]) - c.fp16[j] - FI - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 TO 31 - dst.fp16[j] := a.fp16[j] - b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - [round_note] - -FOR j := 0 TO 31 - dst.fp16[j] := a.fp16[j] - b.fp16[j] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Subtract packed half-precision (16-bit) floating-point elements in "b" from packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := a.fp16[j] - b.fp16[j] - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := a.fp16[0] - b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := a.fp16[0] - b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] - b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] - b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] - b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Subtract the lower half-precision (16-bit) floating-point element in "b" from the lower half-precision (16-bit) floating-point element in "a", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] - b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR i := 0 TO 31 - dst.fp16[i] := a.fp16[i] * b.fp16[i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst". - [round_note] - -FOR i := 0 TO 31 - dst.fp16[i] := a.fp16[i] * b.fp16[i] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed half-precision (16-bit) floating-point elements in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := a.fp16[i] * b.fp16[i] - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := a.fp16[0] * b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := a.fp16[0] * b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] * b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] * b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := a.fp16[0] * b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower half-precision (16-bit) floating-point element in "a" and "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := a.fp16[0] * b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := src.fp16[2*i+0] - dst.fp16[2*i+1] := src.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "src" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := src.fp16[0] - dst.fp16[1] := src.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" and "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) - (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) + (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := a.fp16[0] - dst.fp16[1] := a.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := c.fp16[0] - dst.fp16[1] := c.fp16[1] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := a.fp16[0] - dst.fp16[1] := a.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := c.fp16[0] - dst.fp16[1] := c.fp16[1] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex numbers in "a" and "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) - (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) + (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := a.fp16[2*i+0] - dst.fp16[2*i+1] := a.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using writemask "k" (elements are copied from "c" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := c.fp16[2*i+0] - dst.fp16[2*i+1] := c.fp16[2*i+1] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply packed complex numbers in "a" by the complex conjugates of packed complex numbers in "b", accumulate to the corresponding complex numbers in "c", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -FOR i := 0 to 15 - IF k[i] - dst.fp16[2*i+0] := (a.fp16[2*i+0] * b.fp16[2*i+0]) + (a.fp16[2*i+1] * b.fp16[2*i+1]) + c.fp16[2*i+0] - dst.fp16[2*i+1] := (a.fp16[2*i+1] * b.fp16[2*i+0]) - (a.fp16[2*i+0] * b.fp16[2*i+1]) + c.fp16[2*i+1] - ELSE - dst.fp16[2*i+0] := 0 - dst.fp16[2*i+1] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := a.fp16[0] - dst.fp16[1] := a.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := c.fp16[0] - dst.fp16[1] := c.fp16[1] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst", and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] -dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "a" when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := a.fp16[0] - dst.fp16[1] := a.fp16[1] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using writemask "k" (elements are copied from "c" when mask bit 0 is not set), and copy the upper 6 packed elements from "c" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := c.fp16[0] - dst.fp16[1] := c.fp16[1] -FI -dst[127:32] := c[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the lower complex number in "a" by the complex conjugate of the lower complex number in "b", accumulate to the lower complex number in "c", and store the result in the lower elements of "dst" using zeromask "k" (elements are zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from "a" to the upper elements of "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - [round_note] - -IF k[0] - dst.fp16[0] := (a.fp16[0] * b.fp16[0]) + (a.fp16[1] * b.fp16[1]) + c.fp16[0] - dst.fp16[1] := (a.fp16[1] * b.fp16[0]) - (a.fp16[0] * b.fp16[1]) + c.fp16[1] -ELSE - dst.fp16[0] := 0 - dst.fp16[1] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by addition. Returns the sum of all elements in "a". - -tmp := a -FOR i := 0 to 15 - tmp.fp16[i] := tmp.fp16[i] + a.fp16[i+16] -ENDFOR -FOR i := 0 to 7 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+8] -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] + tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] + tmp.fp16[1] - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by multiplication. Returns the product of all elements in "a". - -tmp := a -FOR i := 0 to 15 - tmp.fp16[i] := tmp.fp16[i] * a.fp16[i+16] -ENDFOR -FOR i := 0 to 7 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+8] -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+4] -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := tmp.fp16[i] * tmp.fp16[i+2] -ENDFOR -dst.fp16[0] := tmp.fp16[0] * tmp.fp16[1] - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by maximum. Returns the maximum of all elements in "a". [max_float_note] - -tmp := a -FOR i := 0 to 15 - tmp.fp16[i] := (a.fp16[i] > a.fp16[i+16] ? a.fp16[i] : a.fp16[i+16]) -ENDFOR -FOR i := 0 to 7 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] > tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] > tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Reduce the packed half-precision (16-bit) floating-point elements in "a" by minimum. Returns the minimum of all elements in "a". [min_float_note] - -tmp := a -FOR i := 0 to 15 - tmp.fp16[i] := (a.fp16[i] < a.fp16[i+16] ? tmp.fp16[i] : a.fp16[i+16]) -ENDFOR -FOR i := 0 to 7 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+8] ? tmp.fp16[i] : tmp.fp16[i+8]) -ENDFOR -FOR i := 0 to 3 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+4] ? tmp.fp16[i] : tmp.fp16[i+4]) -ENDFOR -FOR i := 0 to 1 - tmp.fp16[i] := (tmp.fp16[i] < tmp.fp16[i+2] ? tmp.fp16[i] : tmp.fp16[i+2]) -ENDFOR -dst.fp16[0] := (tmp.fp16[0] < tmp.fp16[1] ? tmp.fp16[0] : tmp.fp16[1]) - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Finds the absolute value of each packed half-precision (16-bit) floating-point element in "v2", storing the results in "dst". - -FOR j := 0 to 31 - dst.fp16[j] := ABS(v2.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst". Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - Compute the complex conjugates of complex numbers in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number "complex = vec.fp16[0] + i * vec.fp16[1]", or the complex conjugate "conjugate = vec.fp16[0] - i * vec.fp16[1]". - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := a[i+31:i] XOR FP32(-0.0) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Arithmetic -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 31 - k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 31 - IF k1[j] - k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 31 - k[j] := (a.fp16[j] OP b.fp16[j]) ? 1 : 0 -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -FOR j := 0 to 31 - IF k1[j] - k[j] := ( a.fp16[j] OP b.fp16[j] ) ? 1 : 0 - ELSE - k[j] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k". [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -k[0] := (a.fp16[0] OP b.fp16[0]) ? 1 : 0 -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -IF k1[0] - k[0] := ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" based on the comparison operand specified by "imm8", and return the boolean result (0 or 1). [sae_note] - CASE (imm8[4:0]) OF -0: OP := _CMP_EQ_OQ -1: OP := _CMP_LT_OS -2: OP := _CMP_LE_OS -3: OP := _CMP_UNORD_Q -4: OP := _CMP_NEQ_UQ -5: OP := _CMP_NLT_US -6: OP := _CMP_NLE_US -7: OP := _CMP_ORD_Q -8: OP := _CMP_EQ_UQ -9: OP := _CMP_NGE_US -10: OP := _CMP_NGT_US -11: OP := _CMP_FALSE_OQ -12: OP := _CMP_NEQ_OQ -13: OP := _CMP_GE_OS -14: OP := _CMP_GT_OS -15: OP := _CMP_TRUE_UQ -16: OP := _CMP_EQ_OS -17: OP := _CMP_LT_OQ -18: OP := _CMP_LE_OQ -19: OP := _CMP_UNORD_S -20: OP := _CMP_NEQ_US -21: OP := _CMP_NLT_UQ -22: OP := _CMP_NLE_UQ -23: OP := _CMP_ORD_S -24: OP := _CMP_EQ_US -25: OP := _CMP_NGE_UQ -26: OP := _CMP_NGT_UQ -27: OP := _CMP_FALSE_OS -28: OP := _CMP_NEQ_OS -29: OP := _CMP_GE_OQ -30: OP := _CMP_GT_OQ -31: OP := _CMP_TRUE_US -ESAC -RETURN ( a.fp16[0] OP b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). - RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] == b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] < b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] <= b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] > b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] !=NaN AND b.fp16[0] !=NaN AND a.fp16[0] >= b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - - Compare the lower half-precision (16-bit) floating-point elements in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a.fp16[0] ==NaN OR b.fp16[0] ==NaN OR a.fp16[0] != b.fp16[0] ) ? 1 : 0 - - - AVX512_FP16 -
immintrin.h
- Compare -
- - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 31 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 31 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 31 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 31 - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 16-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.fp16[j] := Convert_Int16_To_FP16(a.word[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 15 - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 32-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.fp16[j] := Convert_Int32_To_FP16(a.dword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed signed 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed unsigned 64-bit integers in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_Int64_To_FP16(a.qword[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 TO 7 - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.fp16[j] := Convert_FP64_To_FP16(a.fp64[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper element of "dst". - -dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper element of "dst". - [round_note] - -dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". - -IF k[0] - dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". - -IF k[0] - dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper element of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := Convert_FP64_To_FP16(b.fp64[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_note] - -FOR j := 0 to 15 - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp16[j] := Convert_FP32_To_FP16(a.fp32[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a half-precision (16-bit) floating-point elements, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := Convert_FP32_To_FP16(b.fp32[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_Int32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 15 - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 32-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 15 - IF k[j] - dst.dword[j] := Convert_FP16_To_UInt32_Truncate(a.fp16[j]) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_Int64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 7 - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := src.qword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 64-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 7 - IF k[j] - dst.qword[j] := Convert_FP16_To_UInt64_Truncate(a.fp16[j]) - ELSE - dst.qword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". - [round_note] - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_Int16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst". [sae_note] - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst". [sae_note] - -FOR j := 0 TO 31 - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := src.word[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed unsigned 16-bit integers with truncation, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 TO 31 - IF k[j] - dst.word[j] := Convert_FP16_To_UInt16_Truncate(a.fp16[j]) - ELSE - dst.word[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". [sae_note] - -FOR j := 0 to 7 - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := src.fp64[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := src.fp64[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 7 - IF k[j] - dst.fp64[j] := Convert_FP16_To_FP64(a.fp16[j]) - ELSE - dst.fp64[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 15 - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". [sae_note] - -FOR j := 0 to 15 - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := src.fp32[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := src.fp32[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note] - -FOR j := 0 to 15 - IF k[j] - dst.fp32[j] := Convert_FP16_To_FP32(a.fp16[j]) - ELSE - dst.fp32[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [sae_note] - -dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -ELSE - dst.fp64[0] := src.fp64[0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] - -IF k[0] - dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -ELSE - dst.fp64[0] := src.fp64[0] -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". - -IF k[0] - dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -ELSE - dst.fp64[0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper element from "a" to the upper element of "dst". [sae_note] - -IF k[0] - dst.fp64[0] := Convert_FP16_To_FP64(b.fp16[0]) -ELSE - dst.fp64[0] := 0 -FI -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -ELSE - dst.fp32[0] := src.fp32[0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -IF k[0] - dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -ELSE - dst.fp32[0] := src.fp32[0] -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -ELSE - dst.fp32[0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - - Convert the lower half-precision (16-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from "a" to the upper elements of "dst". [sae_note] - -IF k[0] - dst.fp32[0] := Convert_FP16_To_FP32(b.fp16[0]) -ELSE - dst.fp32[0] := 0 -FI -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst.dword := Convert_FP16_To_Int32(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - [round_note] - -dst.dword := Convert_FP16_To_Int32(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst.qword := Convert_FP16_To_Int64(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - [round_note] - -dst.qword := Convert_FP16_To_Int64(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". [sae_note] - -dst.dword := Convert_FP16_To_Int32_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". [sae_note] - -dst.qword := Convert_FP16_To_Int64_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". - -dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer, and store the result in "dst". [sae_note] - -dst.dword := Convert_FP16_To_UInt32(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". - -dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer, and store the result in "dst". [round_note] - -dst.qword := Convert_FP16_To_UInt64(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". - -dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 32-bit integer with truncation, and store the result in "dst". [sae_note] - -dst.dword := Convert_FP16_To_UInt32_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". - -dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the lower half-precision (16-bit) floating-point element in "a" to an unsigned 64-bit integer with truncation, and store the result in "dst". [sae_note] - -dst.qword := Convert_FP16_To_UInt64_Truncate(a.fp16[0]) - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the signed 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the unsigned 32-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := Convert_Int32_To_FP16(b.fp32[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the signed 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - - Convert the unsigned 64-bit integer "b" to a half-precision (16-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := Convert_Int64_To_FP16(b.fp64[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Copy 16-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". - -dst.fp16[0] := a.fp16[0] -dst[MAX:16] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Copy the lower 16-bit integer in "a" to "dst". - -dst.fp16[0] := a.fp16[0] -dst[MAX:16] := 0 - - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". - -dst[15:0] := a.fp16[0] - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". - -dst[15:0] := a.fp16[0] - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - Copy the lower half-precision (16-bit) floating-point element of "a" to "dst". - -dst[15:0] := a.fp16[0] - - AVX512_FP16 -
immintrin.h
- Convert -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [max_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [sae_note][max_float_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][max_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] > b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [min_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [sae_note] [min_float_note] - -FOR j := 0 to 31 - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := src.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Compare packed half-precision (16-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [sae_note][min_float_note] - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := (a.fp16[j] < b.fp16[j] ? a.fp16[j] : b.fp16[j]) - ELSE - dst.fp16[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -IF k[0] - dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -IF k[0] - dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -IF k[0] - dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - - - - - Extract the reduced argument of the lower half-precision (16-bit) floating-point element in "b" by the number of bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -IF k[0] - dst.fp16[0] := ReduceArgumentFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Special Math Functions -
- - - - Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst", and zero the upper elements. - -dst.fp16[0] := MEM[mem_addr].fp16[0] -dst[MAX:16] := 0 - - - AVX512_FP16 -
immintrin.h
- Load -
- - - - - - Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and set the upper elements of "dst" to zero. - -IF k[0] - dst.fp16[0] := MEM[mem_addr].fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[MAX:16] := 0 - - - AVX512_FP16 -
immintrin.h
- Load -
- - - - - Load a half-precision (16-bit) floating-point element from memory into the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and set the upper elements of "dst" to zero. - -IF k[0] - dst.fp16[0] := MEM[mem_addr].fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[MAX:16] := 0 - - - AVX512_FP16 -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Load -
- - - - Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[511:0] := MEM[mem_addr+511:mem_addr] -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Load -
- - - - - Store the lower half-precision (16-bit) floating-point element from "a" into memory. - -MEM[mem_addr].fp16[0] := a.fp16[0] - - - AVX512_FP16 -
immintrin.h
- Store -
- - - - - - Store the lower half-precision (16-bit) floating-point element from "a" into memory using writemask "k". - -IF k[0] - MEM[mem_addr].fp16[0] := a.fp16[0] -FI - - - AVX512_FP16 -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 64-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512_FP16 -
immintrin.h
- Store -
- - - - - Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+511:mem_addr] := a[511:0] - - - AVX512_FP16 -
immintrin.h
- Store -
- - - - - Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := b.fp16[0] -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Move -
- - - - - - - Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := b.fp16[0] -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Move -
- - - - - - Move the lower half-precision (16-bit) floating-point element from "b" to the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := b.fp16[0] -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Move -
- - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Round packed half-precision (16-bit) floating-point elements in "a" to the number of fraction bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := RoundScaleFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -IF k[0] - dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -IF k[0] - dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -IF k[0] - dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Round the lower half-precision (16-bit) floating-point element in "b" to the number of fraction bits specified by "imm8", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". [round_imm_note][sae_note] - -DEFINE RoundScaleFP16(src.fp16, imm8[7:0]) { - m.fp16 := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp.fp16 := POW(FP16(2.0), -m) * ROUND(POW(FP16(2.0), m) * src.fp16, imm8[3:0]) - RETURN tmp.fp16 -} -IF k[0] - dst.fp16[0] := RoundScaleFP16(b.fp16[0], imm8) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 31 - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] - FOR i := 0 to 31 - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] - FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of each packed half-precision (16-bit) floating-point element in "a" to a half-precision (16-bit) floating-point number representing the integer exponent, and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "floor(log2(x))" for each element. [sae_note] - FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ConvertExpFP16(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] - IF k[0] - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. - IF k[0] - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Convert the exponent of the lower half-precision (16-bit) floating-point element in "b" to a half-precision (16-bit) floating-point number representing the integer exponent, store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "floor(log2(x))" for the lower element. [sae_note] - IF k[0] - dst.fp16[0] := ConvertExpFP16(b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 31 - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - FOR i := 0 TO 31 - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - FOR i := 0 TO 31 - IF k[i] - dst.fp16[i] := GetNormalizedMantissaFP16(a.fp16[i], norm, sign) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - IF k[0] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note] - IF k[0] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - - Normalize the mantissas of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". This intrinsic essentially calculates "±(2^k)*|x.significand|", where "k" depends on the interval range defined by "norm" and the sign depends on "sign" and the source sign. - [getmant_note][sae_note] - IF k[0] - dst.fp16[0] := GetNormalizedMantissaFP16(b.fp16[0], norm, sign) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst". [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Extract the reduced argument of packed half-precision (16-bit) floating-point elements in "a" by the number of bits specified by "imm8", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). [round_imm_note][sae_note] - -DEFINE ReduceArgumentFP16(src[15:0], imm8[7:0]) { - m[15:0] := FP16(imm8[7:4]) // number of fraction bits after the binary point to be preserved - tmp[15:0] := POW(2.0, FP16(-m)) * ROUND(POW(2.0, FP16(m)) * src[15:0], imm8[3:0]) - tmp[15:0] := src[15:0] - tmp[15:0] - IF IsInf(tmp[15:0]) - tmp[15:0] := FP16(0.0) - FI - RETURN tmp[15:0] -} -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := ReduceArgumentFP16(a.fp16[i], imm8) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst". - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the packed half-precision (16-bit) floating-point elements in "a" using values from "b", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -FOR i := 0 to 15 - IF k[i] - dst.fp16[i] := ScaleFP16(a.fp16[i], b.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -IF k[0] - dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -IF k[0] - dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -IF k[0] - dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - - Scale the lower half-precision (16-bit) floating-point element in "a" using the value from "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - DEFINE ScaleFP16(src1, src2) { - denormal1 := (a.exp == 0) and (a.fraction != 0) - denormal2 := (b.exp == 0) and (b.fraction != 0) - tmp1 := src1 - tmp2 := src2 - IF MXCSR.DAZ - IF denormal1 - tmp1 := 0 - FI - IF denormal2 - tmp2 := 0 - FI - FI - RETURN tmp1 * POW(2.0, FLOOR(tmp2)) -} -IF k[0] - dst.fp16[0] := ScaleFP16(a.fp16[0], b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k". - [fpclass_note] - FOR i := 0 to 31 - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Test packed half-precision (16-bit) floating-point elements in "a" for special categories specified by "imm8", and store the results in mask vector "k" using zeromask "k1" (elements are zeroed out when the corresponding mask bit is not set). - [fpclass_note] - FOR i := 0 to 31 - IF k1[i] - k[i] := CheckFPClass_FP16(a.fp16[i], imm8[7:0]) - ELSE - k[i] := 0 - FI -ENDFOR -k[MAX:32] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k". - [fpclass_note] - k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Test the lower half-precision (16-bit) floating-point element in "a" for special categories specified by "imm8", and store the result in mask vector "k" using zeromask "k1" (the element is zeroed out when mask bit 0 is not set). - [fpclass_note] - IF k1[0] - k[0] := CheckFPClass_FP16(a.fp16[0], imm8[7:0]) -ELSE - k[0] := 0 -FI -k[MAX:1] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Shuffle half-precision (16-bit) floating-point elements in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - off := idx[i+4:i] - dst.fp16[j] := idx[i+5] ? b.fp16[off] : a.fp16[off] -ENDFOR -dst[MAX:512] := 0 - - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - - Blend packed half-precision (16-bit) floating-point elements from "a" and "b" using control mask "k", and store the results in "dst". - -FOR j := 0 to 31 - IF k[j] - dst.fp16[j] := b.fp16[j] - ELSE - dst.fp16[j] := a.fp16[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - - Shuffle half-precision (16-bit) floating-point elements in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*16 - id := idx[i+4:i] - dst.fp16[j] := a.fp16[id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Miscellaneous -
- - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := (1.0 / SQRT(a.fp16[i])) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -IF k[0] - dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -IF k[0] - dst.fp16[0] := (1.0 / SQRT(b.fp16[0])) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - -FOR i := 0 to 31 - dst.fp16[i] := SQRT(a.fp16[i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". - [round_note] - -FOR i := 0 to 31 - dst.fp16[i] := SQRT(a.fp16[i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - [round_note] - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - [round_note] - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := SQRT(a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -dst.fp16[0] := SQRT(b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst.fp16[0] := SQRT(b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := SQRT(b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := SQRT(b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - -IF k[0] - dst.fp16[0] := SQRT(b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the square root of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". - [round_note] - -IF k[0] - dst.fp16[0] := SQRT(b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - dst.fp16[i] := (1.0 / a.fp16[i]) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := src.fp16[i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in "a", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR i := 0 to 31 - IF k[i] - dst.fp16[i] := (1.0 / a.fp16[i]) - ELSE - dst.fp16[i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -dst.fp16[0] := (1.0 / b.fp16[0]) -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using writemask "k" (the element is copied from "src" when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -IF k[0] - dst.fp16[0] := (1.0 / b.fp16[0]) -ELSE - dst.fp16[0] := src.fp16[0] -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in "b", store the result in the lower element of "dst" using zeromask "k" (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -IF k[0] - dst.fp16[0] := (1.0 / b.fp16[0]) -ELSE - dst.fp16[0] := 0 -FI -dst[127:16] := a[127:16] -dst[MAX:128] := 0 - - - AVX512_FP16 -
immintrin.h
- Elementary Math Functions -
- - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. - -dst.fp16[0] := e0 -dst.fp16[1] := e1 -dst.fp16[2] := e2 -dst.fp16[3] := e3 -dst.fp16[4] := e4 -dst.fp16[5] := e5 -dst.fp16[6] := e6 -dst.fp16[7] := e7 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. - -dst.fp16[0] := e0 -dst.fp16[1] := e1 -dst.fp16[2] := e2 -dst.fp16[3] := e3 -dst.fp16[4] := e4 -dst.fp16[5] := e5 -dst.fp16[6] := e6 -dst.fp16[7] := e7 -dst.fp16[8] := e8 -dst.fp16[9] := e9 -dst.fp16[10] := e10 -dst.fp16[11] := e11 -dst.fp16[12] := e12 -dst.fp16[13] := e13 -dst.fp16[14] := e14 -dst.fp16[15] := e15 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values. - -dst.fp16[0] := e0 -dst.fp16[1] := e1 -dst.fp16[2] := e2 -dst.fp16[3] := e3 -dst.fp16[4] := e4 -dst.fp16[5] := e5 -dst.fp16[6] := e6 -dst.fp16[7] := e7 -dst.fp16[8] := e8 -dst.fp16[9] := e9 -dst.fp16[10] := e10 -dst.fp16[11] := e11 -dst.fp16[12] := e12 -dst.fp16[13] := e13 -dst.fp16[14] := e14 -dst.fp16[15] := e15 -dst.fp16[16] := e16 -dst.fp16[17] := e17 -dst.fp16[18] := e18 -dst.fp16[19] := e19 -dst.fp16[20] := e20 -dst.fp16[21] := e21 -dst.fp16[22] := e22 -dst.fp16[23] := e23 -dst.fp16[24] := e24 -dst.fp16[25] := e25 -dst.fp16[26] := e26 -dst.fp16[27] := e27 -dst.fp16[28] := e28 -dst.fp16[29] := e29 -dst.fp16[30] := e30 -dst.fp16[31] := e31 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst.fp16[0] := e7 -dst.fp16[1] := e6 -dst.fp16[2] := e5 -dst.fp16[3] := e4 -dst.fp16[4] := e3 -dst.fp16[5] := e2 -dst.fp16[6] := e1 -dst.fp16[7] := e0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst.fp16[0] := e15 -dst.fp16[1] := e14 -dst.fp16[2] := e13 -dst.fp16[3] := e12 -dst.fp16[4] := e11 -dst.fp16[5] := e10 -dst.fp16[6] := e9 -dst.fp16[7] := e8 -dst.fp16[8] := e7 -dst.fp16[9] := e6 -dst.fp16[10] := e5 -dst.fp16[11] := e4 -dst.fp16[12] := e3 -dst.fp16[13] := e2 -dst.fp16[14] := e1 -dst.fp16[15] := e0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set packed half-precision (16-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst.fp16[0] := e31 -dst.fp16[1] := e30 -dst.fp16[2] := e29 -dst.fp16[3] := e28 -dst.fp16[4] := e27 -dst.fp16[5] := e26 -dst.fp16[6] := e25 -dst.fp16[7] := e24 -dst.fp16[8] := e23 -dst.fp16[9] := e22 -dst.fp16[10] := e21 -dst.fp16[11] := e20 -dst.fp16[12] := e19 -dst.fp16[13] := e18 -dst.fp16[14] := e17 -dst.fp16[15] := e16 -dst.fp16[16] := e15 -dst.fp16[17] := e14 -dst.fp16[18] := e13 -dst.fp16[19] := e12 -dst.fp16[20] := e11 -dst.fp16[21] := e10 -dst.fp16[22] := e9 -dst.fp16[23] := e8 -dst.fp16[24] := e7 -dst.fp16[25] := e6 -dst.fp16[26] := e5 -dst.fp16[27] := e4 -dst.fp16[28] := e3 -dst.fp16[29] := e2 -dst.fp16[30] := e1 -dst.fp16[31] := e0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". - -FOR i := 0 to 7 - dst.fp16[i] := a[15:0] -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". - -FOR i := 0 to 15 - dst.fp16[i] := a[15:0] -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) floating-point value "a" to all elements of "dst". - -FOR i := 0 to 31 - dst.fp16[i] := a[15:0] -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". - -FOR i := 0 to 3 - dst.fp16[2*i+0] := a[15:0] - dst.fp16[2*i+1] := a[31:16] -ENDFOR -dst[MAX:128] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". - -FOR i := 0 to 7 - dst.fp16[2*i+0] := a[15:0] - dst.fp16[2*i+1] := a[31:16] -ENDFOR -dst[MAX:256] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Broadcast half-precision (16-bit) complex floating-point value "a" to all elements of "dst". - -FOR i := 0 to 15 - dst.fp16[2*i+0] := a[15:0] - dst.fp16[2*i+1] := a[31:16] -ENDFOR -dst[MAX:512] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Copy half-precision (16-bit) floating-point element "a" to the lower element of "dst", and zero the upper 7 elements. - -dst.fp16[0] := a[15:0] -dst[127:16] := 0 - - AVX512_FP16 -
immintrin.h
- Set -
- - - Return vector of type __m512h with all elements set to zero. - -dst[MAX:0] := 0 - - - AVX512_FP16 -
immintrin.h
- Set -
- - - - Cast vector of type "__m128h" to type "__m128". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m256". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512h" to type "__m512". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m128d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m256d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512h" to type "__m512d". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m128i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m256i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512h" to type "__m512i". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128d" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256d" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512d" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128i" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256i" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512i" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512h" to type "__m128h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m512h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m256h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m512h". This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m256h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m128h" to type "__m512h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - - Cast vector of type "__m256h" to type "__m512h"; the upper 128 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - AVX512_FP16 -
immintrin.h
- Cast -
- - - Return vector of type __m512h with undefined elements. - AVX512_FP16 -
immintrin.h
- General Support -
- - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". - -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 3 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". - -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 1 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Bit Manipulation -
- - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 - dst[i+7:i] := a[id+7:id] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - id := idx[i+4:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 - dst[i+7:i] := a[id+7:id] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - id := idx[i+3:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 31 - i := j*8 - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*8 - IF k[j] - off := 8*idx[i+4:i] - dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*8 - IF k[j] - off := 8*idx[i+3:i] - dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - - AVX512_VBMI - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst". - -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Bit Manipulation -
- - - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8] - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Bit Manipulation -
- - - - - - For each 64-bit element in "b", select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of "a", and store the 8 assembled bytes to the corresponding 64-bit element of "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR i := 0 to 7 - q := i * 64 - FOR j := 0 to 7 - tmp8 := 0 - ctrl := a[q+j*8+7:q+j*8] & 63 - FOR l := 0 to 7 - tmp8[l] := b[q+((ctrl+l) & 63)] - ENDFOR - IF k[i*8+j] - dst[q+j*8+7:q+j*8] := tmp8[7:0] - ELSE - dst[q+j*8+7:q+j*8] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Bit Manipulation -
- - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 - dst[i+7:i] := a[id+7:id] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" across lanes using the corresponding index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - id := idx[i+5:i]*8 - IF k[j] - dst[i+7:i] := a[id+7:id] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst". - -FOR j := 0 to 63 - i := j*8 - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := idx[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - - Shuffle 8-bit integers in "a" and "b" across lanes using the corresponding selector and index in "idx", and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 63 - i := j*8 - IF k[j] - off := 8*idx[i+5:i] - dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off] - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - - AVX512_VBMI -
immintrin.h
- Swizzle -
- - - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". - -FOR j := 0 to 15 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". - -FOR j := 0 to 7 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 1 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". - -FOR j := 0 to 7 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". - -FOR j := 0 to 15 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Shift -
- - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Load -
- - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 16 -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 16 -m := 0 -FOR j := 0 to 15 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 16 -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 16 -m := 0 -FOR j := 0 to 7 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 8 -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[255:m] := 0 -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 8 -m := 0 -FOR j := 0 to 31 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[255:m] := src[255:m] -dst[MAX:256] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 8 -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[127:m] := 0 -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 8 -m := 0 -FOR j := 0 to 15 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[127:m] := src[127:m] -dst[MAX:128] := 0 - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Swizzle -
- - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 16 -m := base_addr -FOR j := 0 to 15 - i := j*16 - IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 16 -m := base_addr -FOR j := 0 to 7 - i := j*16 - IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 8 -m := base_addr -FOR j := 0 to 31 - i := j*8 - IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 8 -m := base_addr -FOR j := 0 to 15 - i := j*8 - IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 - AVX512VL -
immintrin.h
- Store -
- - - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 64-bits in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> (c[i+63:i] & 63) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 32-bits in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> (c[i+31:i] & 31) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of "c", and store the lower 16-bits in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> (c[i+15:i] & 15) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "b" and "a" producing an intermediate 128-bit result. Shift the result right by "imm8" bits, and store the lower 64-bits in "dst". - -FOR j := 0 to 7 - i := j*64 - dst[i+63:i] := ((b[i+63:i] << 64)[127:0] | a[i+63:i]) >> imm8[5:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "b" and "a" producing an intermediate 64-bit result. Shift the result right by "imm8" bits, and store the lower 32-bits in "dst". - -FOR j := 0 to 15 - i := j*32 - dst[i+31:i] := ((b[i+31:i] << 32)[63:0] | a[i+31:i]) >> imm8[4:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "b" and "a" producing an intermediate 32-bit result. Shift the result right by "imm8" bits, and store the lower 16-bits in "dst". - -FOR j := 0 to 31 - i := j*16 - dst[i+15:i] := ((b[i+15:i] << 16)[31:0] | a[i+15:i]) >> imm8[3:0] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 64-bits in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << (c[i+63:i] & 63) - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 32-bits in "dst". - -FOR j := 0 to 15 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << (c[i+31:i] & 31) - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "a" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of "c", and store the upper 16-bits in "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << (c[i+15:i] & 15) - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - i := j*64 - IF k[j] - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] - ELSE - dst[i+63:i] := src[i+63:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 64-bit integers in "a" and "b" producing an intermediate 128-bit result. Shift the result left by "imm8" bits, and store the upper 64-bits in "dst". - -FOR j := 0 to 7 - i := j*64 - tmp[127:0] := ((a[i+63:i] << 64)[127:0] | b[i+63:i]) << imm8[5:0] - dst[i+63:i] := tmp[127:64] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - i := j*32 - IF k[j] - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] - ELSE - dst[i+31:i] := src[i+31:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 32-bit integers in "a" and "b" producing an intermediate 64-bit result. Shift the result left by "imm8" bits, and store the upper 32-bits in "dst". - -FOR j := 0 to 15 - i := j*32 - tmp[63:0] := ((a[i+31:i] << 32)[63:0] | b[i+31:i]) << imm8[4:0] - dst[i+31:i] := tmp[63:32] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 31 - i := j*16 - IF k[j] - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - - - - - Concatenate packed 16-bit integers in "a" and "b" producing an intermediate 32-bit result. Shift the result left by "imm8" bits, and store the upper 16-bits in "dst". - -FOR j := 0 to 31 - i := j*16 - tmp[31:0] := ((a[i+15:i] << 16)[31:0] | b[i+15:i]) << imm8[3:0] - dst[i+15:i] := tmp[31:16] -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Shift -
- - Swizzle - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 16-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := MEM[mem_addr+m+15:mem_addr+m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Load -
- - Swizzle - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Load -
- - Swizzle - - - - - Load contiguous active 8-bit integers from unaligned memory at "mem_addr" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := MEM[mem_addr+m+7:mem_addr+m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Load -
- - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 16-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[i+15:i] := a[m+15:m] - m := m + 16 - ELSE - dst[i+15:i] := src[i+15:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - - Load contiguous active 8-bit integers from "a" (those with their respective bit set in mask "k"), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[i+7:i] := a[m+7:m] - m := m + 8 - ELSE - dst[i+7:i] := src[i+7:i] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 16 -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 16 -m := 0 -FOR j := 0 to 31 - i := j*16 - IF k[j] - dst[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in zeromask "k") to "dst", and set the remaining elements to zero. - -size := 8 -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[511:m] := 0 -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to "dst", and pass through the remaining elements from "src". - -size := 8 -m := 0 -FOR j := 0 to 63 - i := j*8 - IF k[j] - dst[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR -dst[511:m] := src[511:m] -dst[MAX:512] := 0 - - - AVX512_VBMI2 -
immintrin.h
- Swizzle -
- - Swizzle - - - - - Contiguously store the active 16-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 16 -m := base_addr -FOR j := 0 to 31 - i := j*16 - IF k[j] - MEM[m+size-1:m] := a[i+15:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 -
immintrin.h
- Store -
- - Swizzle - - - - - Contiguously store the active 8-bit integers in "a" (those with their respective bit set in writemask "k") to unaligned memory at "base_addr". - -size := 8 -m := base_addr -FOR j := 0 to 63 - i := j*8 - IF k[j] - MEM[m+size-1:m] := a[i+7:i] - m := m + size - FI -ENDFOR - - - AVX512_VBMI2 -
immintrin.h
- Store -
- - - - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 7 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 3 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - AVX512_VNNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 15 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 15 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 15 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -FOR j := 0 to 15 - IF k[j] - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 - ELSE - dst.dword[j] := src.dword[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 15 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:512] := 0 - - - AVX512_VNNI -
immintrin.h
- Arithmetic -
- - - - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+15:k1] := 0 -MEM[k2+15:k2] := 0 -FOR i := 0 TO 15 - FOR j := 0 TO 15 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+15:k1].bit[i] |= match - MEM[k2+15:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512F -
immintrin.h
- Mask -
- - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 7 - FOR j := 0 TO 7 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512F -
immintrin.h
- Mask -
- - - - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 3 - FOR j := 0 TO 3 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512VL -
immintrin.h
- Mask -
- - - - - - - Compute intersection of packed 32-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 7 - FOR j := 0 TO 7 - match := (a.dword[i] == b.dword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512VL -
immintrin.h
- Mask -
- - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 1 - FOR j := 0 TO 1 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512VL -
immintrin.h
- Mask -
- - - - - - - Compute intersection of packed 64-bit integer vectors "a" and "b", and store indication of match in the corresponding bit of two mask registers specified by "k1" and "k2". A match in corresponding elements of "a" and "b" is indicated by a set bit in the corresponding bit of the mask registers. - -MEM[k1+7:k1] := 0 -MEM[k2+7:k2] := 0 -FOR i := 0 TO 3 - FOR j := 0 TO 3 - match := (a.qword[i] == b.qword[j] ? 1 : 0) - MEM[k1+7:k1].bit[i] |= match - MEM[k2+7:k2].bit[j] |= match - ENDFOR -ENDFOR - - - AVX512_VP2INTERSECT - AVX512VL -
immintrin.h
- Mask -
- - - - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 3 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52]) -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - Multiply packed unsigned 52-bit integers in each 64-bit element of "__Y" and "__Z" to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in "__X", and store the results in "dst". - - -FOR j := 0 to 1 - i := j*64 - tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i]) - dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0]) -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_IFMA -
immintrin.h
- Arithmetic -
- - - - Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -b := Convert_BF16_To_FP32(MEM[__A+15:__A]) -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := b -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -b := Convert_FP16_To_FP32(MEM[__A+15:__A]) -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := b -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - m := j*32 - dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) -ENDFOR -dst[MAX:256] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert scalar BF16 (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -b := Convert_BF16_To_FP32(MEM[__A+15:__A]) -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := b -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting at location "__A" to a single-precision (32-bit) floating-point, broadcast it to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -b := Convert_FP16_To_FP32(MEM[__A+15:__A]) -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := b -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+15:__A+m]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+15:__A+m]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+m+31:__A+m+16]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at location "__A" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - m := j*32 - dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+m+31:__A+m+16]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 7 - dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) -ENDFOR -dst[MAX:128] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - Convert packed single-precision (32-bit) floating-point elements in "__A" to packed BF16 (16-bit) floating-point elements, and store the results in "dst". - - -FOR j := 0 to 3 - dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j]) -ENDFOR -dst[MAX:64] := 0 - - - AVX_NE_CONVERT -
immintrin.h
- Convert -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "a" with corresponding signed 8-bit integers in "b", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.word := Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j])) - tmp2.word := Signed(ZeroExtend16(a.byte[4*j+1]) * SignExtend16(b.byte[4*j+1])) - tmp3.word := Signed(ZeroExtend16(a.byte[4*j+2]) * SignExtend16(b.byte[4*j+2])) - tmp4.word := Signed(ZeroExtend16(a.byte[4*j+3]) * SignExtend16(b.byte[4*j+3])) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src", and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := src.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "a" with corresponding 16-bit integers in "b", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "src" using signed saturation, and store the packed 32-bit results in "dst". - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(a.word[2*j]) * SignExtend32(b.word[2*j]) - tmp2.dword := SignExtend32(a.word[2*j+1]) * SignExtend32(b.word[2*j+1]) - dst.dword[j] := Saturate32(src.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - AVX_VNNI -
immintrin.h
- Arithmetic -
- - - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of signed 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding signed 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in "__A" with corresponding unsigned 16-bit integers in "__B", producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) - tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) - dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT16 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) - tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) - tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) - tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) - tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) - tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) - tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) - tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) - tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) - tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) - tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) - tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) - tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) - tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) - tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) - tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:256] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 7 - tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) - tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) - tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) - tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) - dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:256] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) - tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) - tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) - tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding signed 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]) - tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]) - tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]) - tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) - tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) - tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) - tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of signed 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with signed saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])) - tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])) - tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])) - tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])) - dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W", and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) - tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) - tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) - tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) - dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -ENDFOR -dst[MAX:128] := 0 - - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in "__A" with corresponding unsigned 8-bit integers in "__B", producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in "__W" with unsigned saturation, and store the packed 32-bit results in "dst". - - -FOR j := 0 to 3 - tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]) - tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]) - tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]) - tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]) - dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -ENDFOR -dst[MAX:128] := 0 - - - - AVX_VNNI_INT8 -
immintrin.h
- Arithmetic -
- - - - - - - Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". - -tmp[511:0] := a -dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - Extract contiguous bits from unsigned 32-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". - -start := control[7:0] -len := control[15:8] -tmp[511:0] := a -dst[31:0] := ZeroExtend32(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - - Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by "len", starting at the bit specified by "start". - -tmp[511:0] := a -dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - Extract contiguous bits from unsigned 64-bit integer "a", and store the result in "dst". Extract the number of bits specified by bits 15:8 of "control", starting at the bit specified by bits 0:7 of "control". - -start := control[7:0] -len := control[15:8] -tmp[511:0] := a -dst[63:0] := ZeroExtend64(tmp[(start[7:0] + len[7:0] - 1):start[7:0]]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Extract the lowest set bit from unsigned 32-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". - -dst := (-a) AND a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Extract the lowest set bit from unsigned 64-bit integer "a" and set the corresponding bit in "dst". All other bits in "dst" are zeroed, and all bits are zeroed if no bits are set in "a". - -dst := (-a) AND a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 32-bit integer "a". - -dst := (a - 1) XOR a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Set all the lower bits of "dst" up to and including the lowest set bit in unsigned 64-bit integer "a". - -dst := (a - 1) XOR a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". - -dst := (a - 1) AND a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the bit in "dst" that corresponds to the lowest set bit in "a". - -dst := (a - 1) AND a - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - Compute the bitwise NOT of 32-bit integer "a" and then AND with "b", and store the results in "dst". - -dst[31:0] := ((NOT a[31:0]) AND b[31:0]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - Compute the bitwise NOT of 64-bit integer "a" and then AND with "b", and store the results in "dst". - -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Count the number of trailing zero bits in unsigned 16-bit integer "a", and return that count in "dst". - -tmp := 0 -dst := 0 -DO WHILE ((tmp < 16) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". - -tmp := 0 -dst := 0 -DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". - -tmp := 0 -dst := 0 -DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Count the number of trailing zero bits in unsigned 32-bit integer "a", and return that count in "dst". - -tmp := 0 -dst := 0 -DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - Count the number of trailing zero bits in unsigned 64-bit integer "a", and return that count in "dst". - -tmp := 0 -dst := 0 -DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - dst := dst + 1 -OD - - - BMI1 -
immintrin.h
- Bit Manipulation -
- - - - - - - Copy all bits from unsigned 32-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". - -n := index[7:0] -dst := a -IF (n < 32) - dst[31:n] := 0 -FI - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - Copy all bits from unsigned 64-bit integer "a" to "dst", and reset (set to 0) the high bits in "dst" starting at "index". - -n := index[7:0] -dst := a -IF (n < 64) - dst[63:n] := 0 -FI - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - Deposit contiguous low bits from unsigned 32-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. - -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 32 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 - FI - m := m + 1 -OD - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - Deposit contiguous low bits from unsigned 64-bit integer "a" to "dst" at the corresponding bit locations specified by "mask"; all other bits in "dst" are set to zero. - -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 64 - IF mask[m] == 1 - dst[m] := tmp[k] - k := k + 1 - FI - m := m + 1 -OD - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - Extract bits from unsigned 32-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. - -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 32 - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 - FI - m := m + 1 -OD - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - Extract bits from unsigned 64-bit integer "a" at the corresponding bit locations specified by "mask" to contiguous low bits in "dst"; the remaining upper bits in "dst" are set to zero. - -tmp := a -dst := 0 -m := 0 -k := 0 -DO WHILE m < 64 - IF mask[m] == 1 - dst[k] := tmp[m] - k := k + 1 - FI - m := m + 1 -OD - - - BMI2 -
immintrin.h
- Bit Manipulation -
- - - - - - Multiply unsigned 32-bit integers "a" and "b", store the low 32-bits of the result in "dst", and store the high 32-bits in "hi". This does not read or write arithmetic flags. - -dst[31:0] := (a * b)[31:0] -MEM[hi+31:hi] := (a * b)[63:32] - - - BMI2 -
immintrin.h
- Arithmetic -
- - - - - - Multiply unsigned 64-bit integers "a" and "b", store the low 64-bits of the result in "dst", and store the high 64-bits in "hi". This does not read or write arithmetic flags. - -dst[63:0] := (a * b)[63:0] -MEM[hi+63:hi] := (a * b)[127:64] - - - BMI2 -
immintrin.h
- Arithmetic -
- - - - - - Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". - -SSP := SSP + a[7:0] * 4 - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Increment the shadow stack pointer by 8 times the value specified in bits [7:0] of "a". - -SSP := SSP + a[7:0] * 8 - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Read the low 32-bits of the current shadow stack pointer, and store the result in "dst". - dst := SSP[31:0] - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Read the current shadow stack pointer, and store the result in "dst". - dst := SSP[63:0] - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Save the previous shadow stack pointer context. - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Restore the saved shadow stack pointer from the shadow stack restore token previously created on shadow stack by saveprevssp. - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - - Write 32-bit value in "val" to a shadow stack page in memory specified by "p". - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - - Write 64-bit value in "val" to a shadow stack page in memory specified by "p". - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - - Write 32-bit value in "val" to a user shadow stack page in memory specified by "p". - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - - Write 64-bit value in "val" to a user shadow stack page in memory specified by "p". - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Mark shadow stack pointed to by IA32_PL0_SSP as busy. - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Mark shadow stack pointed to by "p" as not busy. - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - If CET is enabled, read the low 32-bits of the current shadow stack pointer, and store the result in "dst". Otherwise return 0. - dst := SSP[31:0] - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - If CET is enabled, read the current shadow stack pointer, and store the result in "dst". Otherwise return 0. - dst := SSP[63:0] - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - Increment the shadow stack pointer by 4 times the value specified in bits [7:0] of "a". - -SSP := SSP + a[7:0] * 4 - - - CET_SS -
immintrin.h
- Miscellaneous -
- - - - - Hint to hardware that the cache line that contains "p" should be demoted from the cache closest to the processor core to a level more distant from the processor core. - - CLDEMOTE -
immintrin.h
- Miscellaneous -
- - - - - - Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. - - CLFLUSHOPT -
immintrin.h
- General Support -
- - - - - - Write back to memory the cache line that contains "p" from any level of the cache hierarchy in the cache coherence domain. - - CLWB -
immintrin.h
- General Support -
- - - - - - - - - Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". - CASE (__D[3:0]) OF -0: OP := _CMPCCX_O -1: OP := _CMPCCX_NO -2: OP := _CMPCCX_B -3: OP := _CMPCCX_NB -4: OP := _CMPCCX_Z -5: OP := _CMPCCX_NZ -6: OP := _CMPCCX_BE -7: OP := _CMPCCX_NBE -8: OP := _CMPCCX_S -9: OP := _CMPCCX_NS -10: OP := _CMPCCX_P -11: OP := _CMPCCX_NP -12: OP := _CMPCCX_L -13: OP := _CMPCCX_NL -14: OP := _CMPCCX_LE -15: OP := _CMPCCX_NLE -ESAC -tmp1 := LOAD_LOCK(__A) -tmp2 := tmp1 + __C -IF (tmp1[31:0] OP __B[31:0]) - STORE_UNLOCK(__A, tmp2) -ELSE - STORE_UNLOCK(__A, tmp1) -FI -dst[31:0] := tmp1[31:0] - - - - - - - - - - - - - - - - - - CMPCCXADD -
immintrin.h
- Arithmetic -
- - - - - - - Compares the value from the memory "__A" with the value of "__B". If the specified condition "__D" is met, then add the third operand "__C" to the "__A" and write it into "__A", else the value of "__A" is unchanged. The return value is the original value of "__A". - CASE (__D[3:0]) OF -0: OP := _CMPCCX_O -1: OP := _CMPCCX_NO -2: OP := _CMPCCX_B -3: OP := _CMPCCX_NB -4: OP := _CMPCCX_Z -5: OP := _CMPCCX_NZ -6: OP := _CMPCCX_BE -7: OP := _CMPCCX_NBE -8: OP := _CMPCCX_S -9: OP := _CMPCCX_NS -10: OP := _CMPCCX_P -11: OP := _CMPCCX_NP -12: OP := _CMPCCX_L -13: OP := _CMPCCX_NL -14: OP := _CMPCCX_LE -15: OP := _CMPCCX_NLE -ESAC -tmp1 := LOAD_LOCK(__A) -tmp2 := tmp1 + __C -IF (tmp1[63:0] OP __B[63:0]) - STORE_UNLOCK(__A, tmp2) -ELSE - STORE_UNLOCK(__A, tmp1) -FI -dst[63:0] := tmp1[63:0] - - - - - - - - - - - - - - - - - - CMPCCXADD -
immintrin.h
- Arithmetic -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". - tmp1[7:0] := v[0:7] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[39:0] := tmp1[7:0] << 32 -tmp4[39:0] := tmp2[31:0] << 8 -tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] -tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - CRC32 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". - tmp1[15:0] := v[0:15] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[47:0] := tmp1[15:0] << 32 -tmp4[47:0] := tmp2[31:0] << 16 -tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] -tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - CRC32 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". - tmp1[31:0] := v[0:31] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[63:0] := tmp1[31:0] << 32 -tmp4[63:0] := tmp2[31:0] << 32 -tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] -tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - CRC32 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". - tmp1[63:0] := v[0:63] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[95:0] := tmp1[63:0] << 32 -tmp4[95:0] := tmp2[31:0] << 64 -tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] -tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - CRC32 -
nmmintrin.h
- Cryptography -
- - - - - - - Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst". This intrinsic may only be used in User mode. - - ENQCMD -
immintrin.h
- Unknown -
- - - - - Reads 64-byte command pointed by "__src", formats 64-byte enqueue store data, and performs 64-byte enqueue store to memory pointed by "__dst". This intrinsic may only be used in Privileged mode. - - ENQCMD -
immintrin.h
- Unknown -
- - - - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:256] := 0 - - - F16C -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_imm_note] - -FOR j := 0 to 7 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) -ENDFOR -dst[MAX:128] := 0 - - - F16C -
immintrin.h
- Convert -
- - - - Convert packed half-precision (16-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - m := j*16 - dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m]) -ENDFOR -dst[MAX:128] := 0 - - - F16C -
immintrin.h
- Convert -
- - - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed half-precision (16-bit) floating-point elements, and store the results in "dst". - [round_imm_note] - -FOR j := 0 to 3 - i := 16*j - l := 32*j - dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l]) -ENDFOR -dst[MAX:64] := 0 - - - F16C -
immintrin.h
- Convert -
- - - - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively add and subtract packed elements in "c" to/from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i] - ELSE - dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", alternatively subtract and add packed elements in "c" from/to the intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i] - ELSE - dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i] - FI -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", add the negated intermediate result to packed elements in "c", and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and add the negated intermediate result to the lower element in "c". Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*64 - dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", subtract packed elements in "c" from the negated intermediate result, and store the results in "dst". - -FOR j := 0 to 7 - i := j*32 - dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i] -ENDFOR -dst[MAX:256] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower double-precision (64-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0] -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - - Multiply the lower single-precision (32-bit) floating-point elements in "a" and "b", and subtract the lower element in "c" from the negated intermediate result. Store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0] -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - - - FMA -
immintrin.h
- Arithmetic -
- - - - - Read the FS segment base register and store the 32-bit result in "dst". - dst[31:0] := FS_Segment_Base_Register -dst[63:32] := 0 - - - FSGSBASE -
immintrin.h
- General Support -
- - - Read the FS segment base register and store the 64-bit result in "dst". - dst[63:0] := FS_Segment_Base_Register - - - FSGSBASE -
immintrin.h
- General Support -
- - - Read the GS segment base register and store the 32-bit result in "dst". - dst[31:0] := GS_Segment_Base_Register -dst[63:32] := 0 - - - FSGSBASE -
immintrin.h
- General Support -
- - - Read the GS segment base register and store the 64-bit result in "dst". - dst[63:0] := GS_Segment_Base_Register - - - FSGSBASE -
immintrin.h
- General Support -
- - - - Write the unsigned 32-bit integer "a" to the FS segment base register. - -FS_Segment_Base_Register[31:0] := a[31:0] -FS_Segment_Base_Register[63:32] := 0 - - - FSGSBASE -
immintrin.h
- General Support -
- - - - Write the unsigned 64-bit integer "a" to the FS segment base register. - -FS_Segment_Base_Register[63:0] := a[63:0] - - - FSGSBASE -
immintrin.h
- General Support -
- - - - Write the unsigned 32-bit integer "a" to the GS segment base register. - -GS_Segment_Base_Register[31:0] := a[31:0] -GS_Segment_Base_Register[63:32] := 0 - - - FSGSBASE -
immintrin.h
- General Support -
- - - - Write the unsigned 64-bit integer "a" to the GS segment base register. - -GS_Segment_Base_Register[63:0] := a[63:0] - - - FSGSBASE -
immintrin.h
- General Support -
- - - - - - Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. - state_x87_fpu_mmx_sse := fxrstor(MEM[mem_addr+512*8:mem_addr]) - - - FXSR -
immintrin.h
- OS-Targeted -
- - - - Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at "mem_addr". This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. "mem_addr" must be aligned on a 16-byte boundary. - state_x87_fpu_mmx_sse := fxrstor64(MEM[mem_addr+512*8:mem_addr]) - - - FXSR -
immintrin.h
- OS-Targeted -
- - - - Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. - MEM[mem_addr+512*8:mem_addr] := fxsave(state_x87_fpu_mmx_sse) - - - FXSR -
immintrin.h
- OS-Targeted -
- - - - Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at "mem_addr". The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor. - MEM[mem_addr+512*8:mem_addr] := fxsave64(state_x87_fpu_mmx_sse) - - - FXSR -
immintrin.h
- OS-Targeted -
- - - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := 0 - FI -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := src.byte[j] - FI -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 63 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 7 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:512] := 0 - - - GFNI - AVX512F -
immintrin.h
- Arithmetic -
- - - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := 0 - FI -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := src.byte[j] - FI -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 31 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := 0 - FI -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 - IF k[j] - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) - ELSE - dst.byte[j] := src.byte[j] - FI -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - Multiply the packed 8-bit integers in "a" and "b" in the finite field GF(2^8), and store the results in "dst". The field GF(2^8) is represented in polynomial representation with the reduction polynomial x^8 + x^4 + x^3 + x + 1. - -DEFINE gf2p8mul_byte(src1byte, src2byte) { - tword := 0 - FOR i := 0 to 7 - IF src2byte.bit[i] - tword := tword XOR (src1byte << i) - FI - ENDFOR - FOR i := 14 downto 8 - p := 0x11B << (i-8) - IF tword.bit[i] - tword := tword XOR p - FI - ENDFOR - RETURN tword.byte[0] -} -FOR j := 0 TO 15 - dst.byte[j] := gf2p8mul_byte(a.byte[j], b.byte[j]) -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute an affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. Store the packed 8-bit results in "dst". - -DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND src1byte) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 3 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:256] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using zeromask "k" (elements are zeroed out when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := 0 - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst" using writemask "k" (elements are copied from "src" when the corresponding mask bit is not set). - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - IF k[j*8+i] - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ELSE - dst.qword[j].byte[i] := src.qword[j].byte[i] - FI - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Compute an inverse affine transformation in the Galois Field 2^8. An affine transformation is defined by "A" * "x" + "b", where "A" represents an 8 by 8 bit matrix, "x" represents an 8-bit vector, and "b" is a constant immediate byte. The inverse of the 8-bit values in "x" is defined with respect to the reduction polynomial x^8 + x^4 + x^3 + x + 1. Store the packed 8-bit results in "dst". - DEFINE parity(x) { - t := 0 - FOR i := 0 to 7 - t := t XOR x.bit[i] - ENDFOR - RETURN t -} -DEFINE affine_inverse_byte(tsrc2qw, src1byte, imm8) { - FOR i := 0 to 7 - retbyte.bit[i] := parity(tsrc2qw.byte[7-i] AND inverse(src1byte)) XOR imm8.bit[i] - ENDFOR - RETURN retbyte -} -FOR j := 0 TO 1 - FOR i := 0 to 7 - dst.qword[j].byte[i] := affine_inverse_byte(A.qword[j], x.qword[j].byte[i], b) - ENDFOR -ENDFOR -dst[MAX:128] := 0 - - - GFNI - AVX512VL -
immintrin.h
- Arithmetic -
- - - - - - Provides a hint to the processor to selectively reset the prediction history of the current logical processor specified by a signed 32-bit integer "__eax". - - HRESET -
immintrin.h
- General Support -
- - - - - - Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by "descriptor" based on the invalidation type specified in "type". - The PCID "descriptor" is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved. - The types supported are: - 0) Individual-address invalidation: If "type" is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in "descriptor", except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs. - 1) Single-context invalidation: If "type" is 1, the logical processor invalidates all mappings tagged with the PCID specified in "descriptor" except global translations. In some cases, it may invalidate mappings for other PCIDs as well. - 2) All-context invalidation: If "type" is 2, the logical processor invalidates all mappings tagged with any PCID. - 3) All-context invalidation, retaining global translations: If "type" is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring "descriptor". The instruction may also invalidate global translations as well. 
- -CASE type[1:0] OF -0: // individual-address invalidation retaining global translations - OP_PCID := MEM[descriptor+11:descriptor] - ADDR := MEM[descriptor+127:descriptor+64] - BREAK -1: // single PCID invalidation retaining globals - OP_PCID := MEM[descriptor+11:descriptor] - // invalidate all mappings tagged with OP_PCID except global translations - BREAK -2: // all PCID invalidation - // invalidate all mappings tagged with any PCID - BREAK -3: // all PCID invalidation retaining global translations - // invalidate all mappings tagged with any PCID except global translations - BREAK -ESAC - - - INVPCID -
immintrin.h
- OS-Targeted -
- - - - Flag - - - - - Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], __h[383:0]) -dst := ZF - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - Decrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], __h[511:0]) -dst := ZF - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. - MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], __h[383:0]) -dst := ZF - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - Encrypt 10 rounds of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], __h[511:0]) -dst := ZF - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - Wrap a 128-bit AES key from "__key" into a 384-bit key handle stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The explicit source operand "__htype" specifies handle restrictions. - __h[383:0] := WrapKey128(__key[127:0], __htype) -dst[0] := IWKey.NoBackup -dst[4:1] := IWKey.KeySource[3:0] - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - - Wrap a 256-bit AES key from "__key_hi" and "__key_lo" into a 512-bit key handle stored in "__h" and set IWKey's NoBackup and KeySource bits in "dst". The 32-bit "__htype" specifies handle restrictions. - __h[511:0] := WrapKey256(__key_lo[127:0], __key_hi[127:0], __htype) -dst[0] := IWKey.NoBackup -dst[4:1] := IWKey.KeySource[3:0] - - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - - Load internal wrapping key (IWKey). The 32-bit unsigned integer "__ctl" specifies IWKey's KeySource and whether backing up the key is permitted. IWKey's 256-bit encryption key is loaded from "__enkey_lo" and "__enkey_hi". IWKey's 128-bit integrity key is loaded from "__intkey". - - KEYLOCKER -
immintrin.h
- Cryptography -
- - Flag - - - - - Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - FOR i := 0 to 7 - __odata[i] := AES128Decrypt (__idata[i], __h[383:0]) -ENDFOR -dst := ZF - - - KEYLOCKER_WIDE -
immintrin.h
- Cryptography -
- - Flag - - - - - Decrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - FOR i := 0 to 7 - __odata[i] := AES256Decrypt (__idata[i], __h[511:0]) -ENDFOR -dst := ZF - - - KEYLOCKER_WIDE -
immintrin.h
- Cryptography -
- - Flag - - - - - Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 128-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - FOR i := 0 to 7 - __odata[i] := AES128Encrypt (__idata[i], __h[383:0]) -ENDFOR -dst := ZF - - - KEYLOCKER_WIDE -
immintrin.h
- Cryptography -
- - Flag - - - - - Encrypt 10 rounds of 8 groups of unsigned 8-bit integers in "__idata" using 256-bit AES key specified in "__h", store the resulting unsigned 8-bit integers into the corresponding elements of "__odata", and set "dst" to the ZF flag status. If exception happens, set ZF flag to 1 and zero initialize "__odata". - FOR i := 0 to 7 - __odata[i] := AES256Encrypt (__idata[i], __h[511:0]) -ENDFOR -dst := ZF - - - KEYLOCKER_WIDE -
immintrin.h
- Cryptography -
- - - - - Count the number of leading zero bits in unsigned 32-bit integer "a", and return that count in "dst". - -tmp := 31 -dst := 0 -DO WHILE (tmp >= 0 AND a[tmp] == 0) - tmp := tmp - 1 - dst := dst + 1 -OD - - - LZCNT -
immintrin.h
- Bit Manipulation -
- - - - Count the number of leading zero bits in unsigned 64-bit integer "a", and return that count in "dst". - -tmp := 63 -dst := 0 -DO WHILE (tmp >= 0 AND a[tmp] == 0) - tmp := tmp - 1 - dst := dst + 1 -OD - - - LZCNT -
immintrin.h
- Bit Manipulation -
- - - - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". - -dst[31:0] := a[31:0] -dst[63:32] := 0 - - - MMX -
mmintrin.h
- Convert -
- - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper element of "dst". - -dst[31:0] := a[31:0] -dst[63:32] := 0 - - - MMX -
mmintrin.h
- Convert -
- - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Copy 64-bit integer "a" to "dst". - -dst[63:0] := a[63:0] - - - MMX -
mmintrin.h
- Convert -
- - - - Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. - - MMX -
mmintrin.h
- General Support -
- - - - Empty the MMX state, which marks the x87 FPU registers as available for use by x87 instructions. This instruction must be used at the end of all MMX technology procedures. - - MMX -
mmintrin.h
- General Support -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(b[15:0]) -dst[47:40] := Saturate8(b[31:16]) -dst[55:48] := Saturate8(b[47:32]) -dst[63:56] := Saturate8(b[63:48]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(b[31:0]) -dst[63:48] := Saturate16(b[63:32]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(b[15:0]) -dst[47:40] := SaturateU8(b[31:16]) -dst[55:48] := SaturateU8(b[47:32]) -dst[63:56] := SaturateU8(b[63:48]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(b[15:0]) -dst[47:40] := Saturate8(b[31:16]) -dst[55:48] := Saturate8(b[47:32]) -dst[63:56] := Saturate8(b[63:48]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(b[31:0]) -dst[63:48] := Saturate16(b[63:32]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(b[15:0]) -dst[47:40] := SaturateU8(b[31:16]) -dst[55:48] := SaturateU8(b[47:32]) -dst[63:56] := SaturateU8(b[63:48]) - - - MMX -
mmintrin.h
- Miscellaneous -
- - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[39:32] - dst[15:8] := src2[39:32] - dst[23:16] := src1[47:40] - dst[31:24] := src2[47:40] - dst[39:32] := src1[55:48] - dst[47:40] := src2[55:48] - dst[55:48] := src1[63:56] - dst[63:56] := src2[63:56] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[47:32] - dst[31:16] := src2[47:32] - dst[47:32] := src1[63:48] - dst[63:48] := src2[63:48] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := b[63:32] - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := b[31:0] - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[39:32] - dst[15:8] := src2[39:32] - dst[23:16] := src1[47:40] - dst[31:24] := src2[47:40] - dst[39:32] := src1[55:48] - dst[47:40] := src2[55:48] - dst[55:48] := src1[63:56] - dst[63:56] := src2[63:56] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_BYTES(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[47:32] - dst[31:16] := src2[47:32] - dst[47:32] := src1[63:48] - dst[63:48] := src2[63:48] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_HIGH_WORDS(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := b[63:32] - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[63:0], src2[63:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_BYTES(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[63:0], src2[63:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - RETURN dst[63:0] -} -dst[63:0] := INTERLEAVE_WORDS(a[63:0], b[63:0]) - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := b[31:0] - - - MMX -
mmintrin.h
- Swizzle -
- - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[15:0] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[15:0] -ENDFOR - - - MMX -
mmintrin.h
- Arithmetic -
- - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". - -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". - -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". - -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". - -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" left by "count" while shifting in zeros, and store the result in "dst". - -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << count[63:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" left by "imm8" while shifting in zeros, and store the result in "dst". - -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] << imm8[7:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" right by "count" while shifting in zeros, and store the result in "dst". - -IF count[63:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> count[63:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Shift 64-bit integer "a" right by "imm8" while shifting in zeros, and store the result in "dst". - -IF imm8[7:0] > 63 - dst[63:0] := 0 -ELSE - dst[63:0] := ZeroExtend64(a[63:0] >> imm8[7:0]) -FI - - - MMX -
mmintrin.h
- Shift -
- - - - - Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] AND b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] OR b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] XOR b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise AND of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] AND b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 64 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[63:0] := ((NOT a[63:0]) AND b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise OR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] OR b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compute the bitwise XOR of 64 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[63:0] := (a[63:0] XOR b[63:0]) - - - MMX -
mmintrin.h
- Logical -
- - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - MMX -
mmintrin.h
- Compare -
- - - - Return vector of type __m64 with all elements set to zero. - -dst[MAX:0] := 0 - - - MMX -
mmintrin.h
- Set -
- - - - - Set packed 32-bit integers in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 - - MMX -
mmintrin.h
- Set -
- - - - - - - Set packed 16-bit integers in "dst" with the supplied values. - -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 - - MMX -
mmintrin.h
- Set -
- - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. - -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 - - MMX -
mmintrin.h
- Set -
- - - - Broadcast 32-bit integer "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR - - MMX -
mmintrin.h
- Set -
- - - - Broadcast 16-bit integer "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR - - MMX -
mmintrin.h
- Set -
- - - - Broadcast 8-bit integer "a" to all elements of "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR - - MMX -
mmintrin.h
- Set -
- - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. - -dst[31:0] := e1 -dst[63:32] := e0 - - MMX -
mmintrin.h
- Set -
- - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. - -dst[15:0] := e3 -dst[31:16] := e2 -dst[47:32] := e1 -dst[63:48] := e0 - - MMX -
mmintrin.h
- Set -
- - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. - -dst[7:0] := e7 -dst[15:8] := e6 -dst[23:16] := e5 -dst[31:24] := e4 -dst[39:32] := e3 -dst[47:40] := e2 -dst[55:48] := e1 -dst[63:56] := e0 - - MMX -
mmintrin.h
- Set -
- - - - - - - - Arm address monitoring hardware using the address specified in "p". A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in "extensions", and optional hints in "hints". - - MONITOR -
pmmintrin.h
- General Support -
- - - - - Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR. - - MONITOR -
pmmintrin.h
- General Support -
- - - - - - Load 16 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 1 - i := j*8 - dst[i+7:i] := MEM[ptr+15-i:ptr+8-i] -ENDFOR - - - MOVBE -
immintrin.h
- Load -
- - - - Load 32 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 3 - i := j*8 - dst[i+7:i] := MEM[ptr+31-i:ptr+24-i] -ENDFOR - - - MOVBE -
immintrin.h
- Load -
- - - - Load 64 bits from memory, perform a byte swap operation, and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MEM[ptr+63-i:ptr+56-i] -ENDFOR - - - MOVBE -
immintrin.h
- Load -
- - - - - Perform a byte swap operation of the 16 bits in "data", and store the results to memory. - -FOR j := 0 to 1 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[15-i:8-i] -ENDFOR - - - MOVBE -
immintrin.h
- Store -
- - - - - Perform a byte swap operation of the 32 bits in "data", and store the results to memory. - -addr := MEM[ptr] -FOR j := 0 to 3 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[31-i:24-i] -ENDFOR - - - MOVBE -
immintrin.h
- Store -
- - - - - Perform a byte swap operation of the 64 bits in "data", and store the results to memory. - -addr := MEM[ptr] -FOR j := 0 to 7 - i := j*8 - MEM[ptr+i+7:ptr+i] := data[63-i:56-i] -ENDFOR - - - MOVBE -
immintrin.h
- Store -
- - - - - - - Move 64-byte (512-bit) value using direct store from source memory address "src" to destination memory address "dst". - -MEM[dst+511:dst] := MEM[src+511:src] - - - MOVDIR64B -
immintrin.h
- Store -
- - - - - - - Store 64-bit integer from "val" into memory using direct store. - -MEM[dst+63:dst] := val[63:0] - - - MOVDIRI -
immintrin.h
- Store -
- - - - - Store 32-bit integer from "val" into memory using direct store. - -MEM[dst+31:dst] := val[31:0] - - - MOVDIRI -
immintrin.h
- Store -
- - - - - - - Make a pointer with the value of "srcmem" and bounds set to ["srcmem", "srcmem" + "size" - 1], and store the result in "dst". - dst := srcmem -dst.LB := srcmem.LB -dst.UB := srcmem + size - 1 - - - MPX -
immintrin.h
- Miscellaneous - -
- - - - - - Narrow the bounds for pointer "q" to the intersection of the bounds of "r" and the bounds ["q", "q" + "size" - 1], and store the result in "dst". - dst := q -IF r.LB > (q + size - 1) OR r.UB < q - dst.LB := 1 - dst.UB := 0 -ELSE - dst.LB := MAX(r.LB, q) - dst.UB := MIN(r.UB, (q + size - 1)) -FI - - MPX -
immintrin.h
- Miscellaneous - -
- - - - - Make a pointer with the value of "q" and bounds set to the bounds of "r" (e.g. copy the bounds of "r" to pointer "q"), and store the result in "dst". - dst := q -dst.LB := r.LB -dst.UB := r.UB - - MPX -
immintrin.h
- Miscellaneous - -
- - - - Make a pointer with the value of "q" and open bounds, which allow the pointer to access the entire virtual address space, and store the result in "dst". - dst := q -dst.LB := 0 -dst.UB := 0 - - MPX -
immintrin.h
- Miscellaneous - -
- - - - - Stores the bounds of "ptr_val" pointer in memory at address "ptr_addr". - MEM[ptr_addr].LB := ptr_val.LB -MEM[ptr_addr].UB := ptr_val.UB - - - MPX -
immintrin.h
- Miscellaneous - -
- - - - Checks if "q" is within its lower bound, and throws a #BR if not. - IF q < q.LB - #BR -FI - - - MPX -
immintrin.h
- Miscellaneous - -
- - - - Checks if "q" is within its upper bound, and throws a #BR if not. - IF q > q.UB - #BR -FI - - - - MPX -
immintrin.h
- Miscellaneous - -
- - - - - Checks if ["q", "q" + "size" - 1] is within the lower and upper bounds of "q" and throws a #BR if not. - IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB - #BR -FI - - - - MPX -
immintrin.h
- Miscellaneous - -
- - - - Return the lower bound of "q". - dst := q.LB - - MPX -
immintrin.h
- Miscellaneous - -
- - - - Return the upper bound of "q". - dst := q.UB - - MPX -
immintrin.h
- Miscellaneous - -
- - - - - Set "dst" to the index of the lowest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. - -tmp := 0 -IF a == 0 - // dst is undefined -ELSE - DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - OD -FI -dst := tmp - - -
immintrin.h
- Bit Manipulation -
- - - - Set "dst" to the index of the highest set bit in 32-bit integer "a". If no bits are set in "a" then "dst" is undefined. - -tmp := 31 -IF a == 0 - // dst is undefined -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD -FI -dst := tmp - - -
immintrin.h
- Bit Manipulation -
- - - - - Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. - -tmp := 0 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp < 32) AND a[tmp] == 0) - tmp := tmp + 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 31) ? 0 : 1 -FI - - -
immintrin.h
- Bit Manipulation -
- - - - - Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. - -tmp := 31 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 0) ? 0 : 1 -FI - - -
immintrin.h
- Bit Manipulation -
- - - - - Set "index" to the index of the lowest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. - -tmp := 0 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp < 64) AND a[tmp] == 0) - tmp := tmp + 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 63) ? 0 : 1 -FI - - -
immintrin.h
- Bit Manipulation -
- - - - - Set "index" to the index of the highest set bit in 32-bit integer "mask". If no bits are set in "a", then "index" is undefined and "dst" is set to 0, otherwise "dst" is set to 1. - -tmp := 63 -IF a == 0 - // MEM[index+31:index] is undefined - dst := 0 -ELSE - DO WHILE ((tmp > 0) AND a[tmp] == 0) - tmp := tmp - 1 - OD - MEM[index+31:index] := tmp - dst := (tmp == 0) ? 0 : 1 -FI - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 32-bit integer "a". - -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to its complement. - -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := ~dst[0] - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to zero. - -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := 0 - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 32-bit integer "a", and set that bit to one. - -addr := a + ZeroExtend64(b) -dst[0] := MEM[addr] -MEM[addr] := 1 - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 64-bit integer "a". - -addr := a + b -dst[0] := MEM[addr] - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to its complement. - -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := ~dst[0] - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to zero. - -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := 0 - - -
immintrin.h
- Bit Manipulation -
- - - - - Return the bit at index "b" of 64-bit integer "a", and set that bit to one. - -addr := a + b -dst[0] := MEM[addr] -MEM[addr] := 1 - - -
immintrin.h
- Bit Manipulation -
- - - - Reverse the byte order of 32-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. - -dst[7:0] := a[31:24] -dst[15:8] := a[23:16] -dst[23:16] := a[15:8] -dst[31:24] := a[7:0] - - -
immintrin.h
- Bit Manipulation -
- - - - Reverse the byte order of 64-bit integer "a", and store the result in "dst". This intrinsic is provided for conversion between little and big endian values. - -dst[7:0] := a[63:56] -dst[15:8] := a[55:48] -dst[23:16] := a[47:40] -dst[31:24] := a[39:32] -dst[39:32] := a[31:24] -dst[47:40] := a[23:16] -dst[55:48] := a[15:8] -dst[63:56] := a[7:0] - - -
immintrin.h
- Bit Manipulation -
- - - - Cast from type float to type unsigned __int32 without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
- Cast -
- - - - Cast from type double to type unsigned __int64 without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
- Cast -
- - - - Cast from type unsigned __int32 to type float without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
- Cast -
- - - - Cast from type unsigned __int64 to type double without conversion. - This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. -
immintrin.h
- Cast -
- - - - - Shift the bits of unsigned long integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". - // size := 32 or 64 -dst := a -count := shift AND (size - 1) -DO WHILE (count > 0) - tmp[0] := dst[size - 1] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD - - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned long integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". - // size := 32 or 64 -dst := a -count := shift AND (size - 1) -DO WHILE (count > 0) - tmp[size - 1] := dst[0] - dst := (dst >> 1) OR tmp[size - 1] - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 32-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 31 -DO WHILE (count > 0) - tmp[0] := dst[31] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 32-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 31 -DO WHILE (count > 0) - tmp[31] := dst[0] - dst := (dst >> 1) OR tmp - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 16-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 15 -DO WHILE (count > 0) - tmp[0] := dst[15] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 16-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 15 -DO WHILE (count > 0) - tmp[15] := dst[0] - dst := (dst >> 1) OR tmp - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 64-bit integer "a" left by the number of bits specified in "shift", rotating the most-significant bit to the least-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 63 -DO WHILE (count > 0) - tmp[0] := dst[63] - dst := (dst << 1) OR tmp[0] - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - - Shift the bits of unsigned 64-bit integer "a" right by the number of bits specified in "shift", rotating the least-significant bit to the most-significant bit location, and store the unsigned result in "dst". - -dst := a -count := shift AND 63 -DO WHILE (count > 0) - tmp[63] := dst[0] - dst := (dst >> 1) OR tmp[63] - count := count - 1 -OD - - -
immintrin.h
- Shift -
- - - - Treat the processor-specific feature(s) specified in "a" as available. Multiple features may be OR'd together. See the valid feature flags below: - -_FEATURE_GENERIC_IA32 -_FEATURE_FPU -_FEATURE_CMOV -_FEATURE_MMX -_FEATURE_FXSAVE -_FEATURE_SSE -_FEATURE_SSE2 -_FEATURE_SSE3 -_FEATURE_SSSE3 -_FEATURE_SSE4_1 -_FEATURE_SSE4_2 -_FEATURE_MOVBE -_FEATURE_POPCNT -_FEATURE_PCLMULQDQ -_FEATURE_AES -_FEATURE_F16C -_FEATURE_AVX -_FEATURE_RDRND -_FEATURE_FMA -_FEATURE_BMI -_FEATURE_LZCNT -_FEATURE_HLE -_FEATURE_RTM -_FEATURE_AVX2 -_FEATURE_KNCNI -_FEATURE_AVX512F -_FEATURE_ADX -_FEATURE_RDSEED -_FEATURE_AVX512ER -_FEATURE_AVX512PF -_FEATURE_AVX512CD -_FEATURE_SHA -_FEATURE_MPX -_FEATURE_AVX512BW -_FEATURE_AVX512VL -_FEATURE_AVX512VBMI -_FEATURE_AVX512_4FMAPS -_FEATURE_AVX512_4VNNIW -_FEATURE_AVX512_VPOPCNTDQ -_FEATURE_AVX512_BITALG -_FEATURE_AVX512_VBMI2 -_FEATURE_GFNI -_FEATURE_VAES -_FEATURE_VPCLMULQDQ -_FEATURE_AVX512_VNNI -_FEATURE_CLWB -_FEATURE_RDPID -_FEATURE_IBT -_FEATURE_SHSTK -_FEATURE_SGX -_FEATURE_WBNOINVD -_FEATURE_PCONFIG -_FEATURE_AXV512_4VNNIB -_FEATURE_AXV512_4FMAPH -_FEATURE_AXV512_BITALG2 -_FEATURE_AXV512_VP2INTERSECT - -
immintrin.h
- General Support -
- - - - Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This function is limited to bitmask values in the first 'page' of the libirc cpu-id information. This intrinsic does not check the processor vendor. See the valid feature flags below: - -_FEATURE_GENERIC_IA32 -_FEATURE_FPU -_FEATURE_CMOV -_FEATURE_MMX -_FEATURE_FXSAVE -_FEATURE_SSE -_FEATURE_SSE2 -_FEATURE_SSE3 -_FEATURE_SSSE3 -_FEATURE_SSE4_1 -_FEATURE_SSE4_2 -_FEATURE_MOVBE -_FEATURE_POPCNT -_FEATURE_PCLMULQDQ -_FEATURE_AES -_FEATURE_F16C -_FEATURE_AVX -_FEATURE_RDRND -_FEATURE_FMA -_FEATURE_BMI -_FEATURE_LZCNT -_FEATURE_HLE -_FEATURE_RTM -_FEATURE_AVX2 -_FEATURE_KNCNI -_FEATURE_AVX512F -_FEATURE_ADX -_FEATURE_RDSEED -_FEATURE_AVX512ER -_FEATURE_AVX512PF -_FEATURE_AVX512CD -_FEATURE_SHA -_FEATURE_MPX -_FEATURE_AVX512BW -_FEATURE_AVX512VL -_FEATURE_AVX512VBMI -_FEATURE_AVX512_4FMAPS -_FEATURE_AVX512_4VNNIW -_FEATURE_AVX512_VPOPCNTDQ -_FEATURE_AVX512_BITALG -_FEATURE_AVX512_VBMI2 -_FEATURE_GFNI -_FEATURE_VAES -_FEATURE_VPCLMULQDQ -_FEATURE_AVX512_VNNI -_FEATURE_CLWB -_FEATURE_RDPID -_FEATURE_IBT -_FEATURE_SHSTK -_FEATURE_SGX -_FEATURE_WBNOINVD -_FEATURE_PCONFIG -_FEATURE_AXV512_4VNNIB -_FEATURE_AXV512_4FMAPH -_FEATURE_AXV512_BITALG2 -_FEATURE_AXV512_VP2INTERSECT -_FEATURE_AXV512_FP16 - -
immintrin.h
- General Support -
- - - - - Dynamically query the processor to determine if the processor-specific feature(s) specified in "a" are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This works identically to the previous variant, except it also accepts a 'page' index that permits checking features on the 2nd page of the libirc information. When provided with a '0' in the 'page' parameter, this works identically to _may_i_use_cpu_feature. This intrinsic does not check the processor vendor. See the valid feature flags on the 2nd page below: (provided with a '1' in the 'page' parameter) - -_FEATURE_CLDEMOTE -_FEATURE_MOVDIRI -_FEATURE_MOVDIR64B -_FEATURE_WAITPKG -_FEATURE_AVX512_Bf16 -_FEATURE_ENQCMD -_FEATURE_AVX_VNNI -_FEATURE_AMX_TILE -_FEATURE_AMX_INT8 -_FEATURE_AMX_BF16 -_FEATURE_KL -_FEATURE_WIDE_KL -_FEATURE_HRESET -_FEATURE_UINTR -_FEATURE_PREFETCHI -_FEATURE_AVXVNNIINT8 -_FEATURE_CMPCCXADD -_FEATURE_AVXIFMA -_FEATURE_AVXNECONVERT -_FEATURE_RAOINT -_FEATURE_AMX_FP16 -_FEATURE_AMX_COMPLEX -_FEATURE_SHA512 -_FEATURE_SM3 -_FEATURE_SM4 -_FEATURE_AVXVNNIINT16 -_FEATURE_USERMSR -_FEATURE_AVX10_1_256 -_FEATURE_AVX10_1_512 -_FEATURE_APXF -_FEATURE_MSRLIST -_FEATURE_WRMSRNS -_FEATURE_PBNDKB - -
immintrin.h
- General Support -
- - - - Dynamically query the processor to determine if the processor-specific feature(s) specified as a series of compile-time string literals in "feature, ..." are available, and return true or false (1 or 0) if the set of features is available. These feature names are converted to a bitmask and use the same infrastructure as _may_i_use_cpu_feature_ext to validate it. The behavior is the same as the previous variants. This intrinsic does not check the processor vendor. Supported string literals are one-to-one corresponding in the "Operation" sections of _may_i_use_cpu_feature and _may_i_use_cpu_feature_ext. Example string literals are "avx2", "bmi", "avx512fp16", "amx-int8"... - - -
immintrin.h
- General Support -
- - - - Read the Performance Monitor Counter (PMC) specified by "a", and store up to 64-bits in "dst". The width of performance counters is implementation specific. - dst[63:0] := ReadPMC(a) - - -
immintrin.h
- General Support -
- - - - - - - Add unsigned 32-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[32:0] := a[31:0] + b[31:0] + (c_in > 0 ? 1 : 0) -MEM[out+31:out] := tmp[31:0] -dst[0] := tmp[32] -dst[7:1] := 0 - - -
immintrin.h
- Arithmetic -
- - - - - - - Add unsigned 64-bit integers "a" and "b" with unsigned 8-bit carry-in "c_in" (carry flag), and store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[64:0] := a[63:0] + b[63:0] + (c_in > 0 ? 1 : 0) -MEM[out+63:out] := tmp[63:0] -dst[0] := tmp[64] -dst[7:1] := 0 - - -
immintrin.h
- Arithmetic -
- - - - - - - Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 32-bit integer "b", and subtract the result from unsigned 32-bit integer "a". Store the unsigned 32-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[32:0] := a[31:0] - (b[31:0] + (c_in > 0 ? 1 : 0)) -MEM[out+31:out] := tmp[31:0] -dst[0] := tmp[32] -dst[7:1] := 0 - - -
immintrin.h
- Arithmetic -
- - - - - - - Add unsigned 8-bit borrow "c_in" (carry flag) to unsigned 64-bit integer "b", and subtract the result from unsigned 64-bit integer "a". Store the unsigned 64-bit result in "out", and the carry-out in "dst" (carry or overflow flag). - -tmp[64:0] := a[63:0] - (b[63:0] + (c_in > 0 ? 1 : 0)) -MEM[out+63:out] := tmp[63:0] -dst[0] := tmp[64] -dst[7:1] := 0 - - -
immintrin.h
- Arithmetic -
- - - - Insert the 32-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. - -
immintrin.h
- Miscellaneous -
- - - - Insert the 64-bit data from "a" into a Processor Trace stream via a PTW packet. The PTW packet will be inserted if tracing is currently enabled and ptwrite is currently enabled. The current IP will also be inserted via a FUP packet if FUPonPTW is enabled. - -
immintrin.h
- Miscellaneous -
- - - - - Invoke the Intel SGX enclave user (non-privilege) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
- Miscellaneous -
- - - - - Invoke the Intel SGX enclave system (privileged) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
- Miscellaneous -
- - - - - Invoke the Intel SGX enclave virtualized (VMM) leaf function specified by "a", and return the error code. The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. - -
immintrin.h
- Miscellaneous -
- - - - Write back and flush internal caches. - Initiate writing-back and flushing of external - caches. - -
immintrin.h
- Miscellaneous -
- - - - Convert the half-precision (16-bit) floating-point value "a" to a single-precision (32-bit) floating-point value, and store the result in "dst". - -dst[31:0] := Convert_FP16_To_FP32(a[15:0]) - -
emmintrin.h
- Convert -
- - - - - Convert the single-precision (32-bit) floating-point value "a" to a half-precision (16-bit) floating-point value, and store the result in "dst". - [round_note] - -dst[15:0] := Convert_FP32_To_FP16(a[31:0]) - -
emmintrin.h
- Convert -
- - - - - - - Perform a carry-less multiplication of two 64-bit integers, selected from "a" and "b" according to "imm8", and store the results in "dst". - -IF (imm8[0] == 0) - TEMP1 := a[63:0] -ELSE - TEMP1 := a[127:64] -FI -IF (imm8[4] == 0) - TEMP2 := b[63:0] -ELSE - TEMP2 := b[127:64] -FI -FOR i := 0 to 63 - TEMP[i] := (TEMP1[0] and TEMP2[i]) - FOR j := 1 to i - TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) - ENDFOR - dst[i] := TEMP[i] -ENDFOR -FOR i := 64 to 127 - TEMP[i] := 0 - FOR j := (i - 63) to 63 - TEMP[i] := TEMP[i] XOR (TEMP1[j] AND TEMP2[i-j]) - ENDFOR - dst[i] := TEMP[i] -ENDFOR -dst[127] := 0 - - - PCLMULQDQ -
wmmintrin.h
- Application-Targeted -
- - - - - - - Invoke the PCONFIG leaf function specified by "a". The "__data" array contains 3 32- or 64-bit elements that may act as input, output, or be unused, depending on the semantics of the specified leaf function; these correspond to ebx, ecx, and edx. May return the value in eax, depending on the semantics of the specified leaf function. - - PCONFIG -
immintrin.h
- Miscellaneous -
- - - - - - Count the number of bits set to 1 in unsigned 32-bit integer "a", and return that count in "dst". - -dst := 0 -FOR i := 0 to 31 - IF a[i] - dst := dst + 1 - FI -ENDFOR - - - POPCNT -
immintrin.h
- Bit Manipulation -
- - - - Count the number of bits set to 1 in unsigned 64-bit integer "a", and return that count in "dst". - -dst := 0 -FOR i := 0 to 63 - IF a[i] - dst := dst + 1 - FI -ENDFOR - - - POPCNT -
immintrin.h
- Bit Manipulation -
- - - - Count the number of bits set to 1 in 32-bit integer "a", and return that count in "dst". - -dst := 0 -FOR i := 0 to 31 - IF a[i] - dst := dst + 1 - FI -ENDFOR - - - POPCNT -
immintrin.h
- Bit Manipulation -
- - - - Count the number of bits set to 1 in 64-bit integer "a", and return that count in "dst". - -dst := 0 -FOR i := 0 to 63 - IF a[i] - dst := dst + 1 - FI -ENDFOR - - - POPCNT -
immintrin.h
- Bit Manipulation -
- - - - - - Loads an instruction sequence containing the specified memory address into all level cache. - - PREFETCHI -
x86gprintrin.h
- General Support -
- - - - Loads an instruction sequence containing the specified memory address into all but the first-level cache. - - PREFETCHI -
x86gprintrin.h
- General Support -
- - - - - Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> - <li>_MM_HINT_ET0 // 7, move data using the ET0 hint. The PREFETCHW instruction will be generated.</li> - <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> - <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> - <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> - <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> - - - - - - - PRFCHW -
immintrin.h
- General Support -
- - - - - Atomically add a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. - - -MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically add a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. - - -MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically and a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. - - -MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically and a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. - - -MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically or a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. - - -MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically or a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. - - -MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically xor a 32-bit value at memory operand "__A" and a 32-bit "__B", and store the result to the same memory location. - - -MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - Atomically xor a 64-bit value at memory operand "__A" and a 64-bit "__B", and store the result to the same memory location. - - -MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0] - - - - RAO_INT -
x86gprintrin.h
- Arithmetic -
- - - - Copy the IA32_TSC_AUX MSR (signature value) into "dst". - dst[31:0] := IA32_TSC_AUX[31:0] - - - RDPID -
immintrin.h
- General Support -
- - - - - - Read a hardware generated 16-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[15:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[15:0] := 0 - dst := 0 -FI - - - RDRAND -
immintrin.h
- Random -
- - - - Read a hardware generated 32-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[31:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[31:0] := 0 - dst := 0 -FI - - - RDRAND -
immintrin.h
- Random -
- - - - Read a hardware generated 64-bit random value and store the result in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_RND_GEN.ready == 1 - val[63:0] := HW_RND_GEN.data - dst := 1 -ELSE - val[63:0] := 0 - dst := 0 -FI - - - RDRAND -
immintrin.h
- Random -
- - - - - - Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[15:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[15:0] := 0 - dst := 0 -FI - - - RDSEED -
immintrin.h
- Random -
- - - - Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[31:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[31:0] := 0 - dst := 0 -FI - - - RDSEED -
immintrin.h
- Random -
- - - - Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in "val". Return 1 if a random value was generated, and 0 otherwise. - IF HW_NRND_GEN.ready == 1 - val[63:0] := HW_NRND_GEN.data - dst := 1 -ELSE - val[63:0] := 0 - dst := 0 -FI - - - RDSEED -
immintrin.h
- Random -
- - - - - - Copy the current 64-bit value of the processor's time-stamp counter into "dst", and store the IA32_TSC_AUX MSR (signature value) into memory at "mem_addr". - dst[63:0] := TimeStampCounter -MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0] - - - RDTSCP -
immintrin.h
- General Support -
- - - - - - Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the "imm8" parameter will be provided in bits [31:24] of EAX. - Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction. - IF RTM_ACTIVE == 0 - // nop -ELSE - // restore architectural register state - // discard memory updates performed in transaction - // update EAX with status and imm8 value - eax[31:24] := imm8[7:0] - RTM_NEST_COUNT := 0 - RTM_ACTIVE := 0 - IF _64_BIT_MODE - RIP := fallbackRIP - ELSE - EIP := fallbackEIP - FI -FI - - - RTM -
immintrin.h
- General Support -
- - - - Specify the start of an RTM code region. - If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution. - On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction. Return status of ~0 (0xFFFF) if continuing inside transaction; all other codes are aborts. - IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT - RTM_NEST_COUNT := RTM_NEST_COUNT + 1 - IF RTM_NEST_COUNT == 1 - IF _64_BIT_MODE - fallbackRIP := RIP - ELSE IF _32_BIT_MODE - fallbackEIP := EIP - FI - - RTM_ACTIVE := 1 - // enter RTM execution, record register state, start tracking memory state - FI -ELSE - // RTM abort (see _xabort) -FI - - - RTM -
immintrin.h
- General Support -
- - - - Specify the end of an RTM code region. - If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically. - If the commit fails, the logical processor will perform an RTM abort. - IF RTM_ACTIVE == 1 - RTM_NEST_COUNT := RTM_NEST_COUNT - 1 - IF RTM_NEST_COUNT == 0 - // try to commit transaction - IF FAIL_TO_COMMIT_TRANSACTION - // RTM abort (see _xabort) - ELSE - RTM_ACTIVE := 0 - FI - FI -FI - - - RTM -
immintrin.h
- General Support -
- - - - Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise. - IF (RTM_ACTIVE == 1 OR HLE_ACTIVE == 1) - dst := 1 -ELSE - dst := 0 -FI - - - RTM -
immintrin.h
- General Support -
- - - - - Serialize instruction execution, ensuring all modifications to flags, registers, and memory by previous instructions are completed before the next instruction is fetched. - - SERIALIZE -
immintrin.h
- General Support -
- - - - - - - Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". - -W0 := a[127:96] -W1 := a[95:64] -W2 := a[63:32] -W3 := a[31:0] -W4 := b[127:96] -W5 := b[95:64] -dst[127:96] := W2 XOR W0 -dst[95:64] := W3 XOR W1 -dst[63:32] := W4 XOR W2 -dst[31:0] := W5 XOR W3 - - - SHA -
immintrin.h
- Cryptography -
- - - - - Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in "a" and the previous message values in "b", and store the result in "dst". - -W13 := b[95:64] -W14 := b[63:32] -W15 := b[31:0] -W16 := (a[127:96] XOR W13) <<< 1 -W17 := (a[95:64] XOR W14) <<< 1 -W18 := (a[63:32] XOR W15) <<< 1 -W19 := (a[31:0] XOR W16) <<< 1 -dst[127:96] := W16 -dst[95:64] := W17 -dst[63:32] := W18 -dst[31:0] := W19 - - - SHA -
immintrin.h
- Cryptography -
- - - - - Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable "a", add that value to the scheduled values (unsigned 32-bit integers) in "b", and store the result in "dst". - -tmp := (a[127:96] <<< 30) -dst[127:96] := b[127:96] + tmp -dst[95:64] := b[95:64] -dst[63:32] := b[63:32] -dst[31:0] := b[31:0] - - - SHA -
immintrin.h
- Cryptography -
- - - - - - Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from "a" and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from "b", and store the updated SHA1 state (A,B,C,D) in "dst". "func" contains the logic functions and round constants. - IF (func[1:0] == 0) - f := f0() - K := K0 -ELSE IF (func[1:0] == 1) - f := f1() - K := K1 -ELSE IF (func[1:0] == 2) - f := f2() - K := K2 -ELSE IF (func[1:0] == 3) - f := f3() - K := K3 -FI -A := a[127:96] -B := a[95:64] -C := a[63:32] -D := a[31:0] -W[0] := b[127:96] -W[1] := b[95:64] -W[2] := b[63:32] -W[3] := b[31:0] -A[1] := f(B, C, D) + (A <<< 5) + W[0] + K -B[1] := A -C[1] := B <<< 30 -D[1] := C -E[1] := D -FOR i := 1 to 3 - A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K - B[i+1] := A[i] - C[i+1] := B[i] <<< 30 - D[i+1] := C[i] - E[i+1] := D[i] -ENDFOR -dst[127:96] := A[4] -dst[95:64] := B[4] -dst[63:32] := C[4] -dst[31:0] := D[4] - - - SHA -
immintrin.h
- Cryptography -
- - - - - Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". - W4 := b[31:0] -W3 := a[127:96] -W2 := a[95:64] -W1 := a[63:32] -W0 := a[31:0] -dst[127:96] := W3 + sigma0(W4) -dst[95:64] := W2 + sigma0(W3) -dst[63:32] := W1 + sigma0(W2) -dst[31:0] := W0 + sigma0(W1) - - - SHA -
immintrin.h
- Cryptography -
- - - - - Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from "a" and "b", and store the result in "dst". - W14 := b[95:64] -W15 := b[127:96] -W16 := a[31:0] + sigma1(W14) -W17 := a[63:32] + sigma1(W15) -W18 := a[95:64] + sigma1(W16) -W19 := a[127:96] + sigma1(W17) -dst[127:96] := W19 -dst[95:64] := W18 -dst[63:32] := W17 -dst[31:0] := W16 - - - SHA -
immintrin.h
- Cryptography -
- - - - - - Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from "a", an initial SHA256 state (A,B,E,F) from "b", and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from "k", and store the updated SHA256 state (A,B,E,F) in "dst". - A[0] := b[127:96] -B[0] := b[95:64] -C[0] := a[127:96] -D[0] := a[95:64] -E[0] := b[63:32] -F[0] := b[31:0] -G[0] := a[63:32] -H[0] := a[31:0] -W_K[0] := k[31:0] -W_K[1] := k[63:32] -FOR i := 0 to 1 - A[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]) - B[i+1] := A[i] - C[i+1] := B[i] - D[i+1] := C[i] - E[i+1] := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i] - F[i+1] := E[i] - G[i+1] := F[i] - H[i+1] := G[i] -ENDFOR -dst[127:96] := A[2] -dst[95:64] := B[2] -dst[63:32] := E[2] -dst[31:0] := F[2] - - - SHA -
immintrin.h
- Cryptography -
- - - - - This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs an intermediate calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". - - -DEFINE ROR64(qword, n) { - count := n % 64 - dest := (qword >> count) | (qword << (64 - count)) - RETURN dest -} -DEFINE SHR64(qword, n) { - RETURN qword >> n -} -DEFINE s0(qword) { - RETURN ROR64(qword,1) ^ ROR64(qword, 8) ^ SHR64(qword, 7) -} -W.qword[4] := __B.qword[0] -W.qword[3] := __A.qword[3] -W.qword[2] := __A.qword[2] -W.qword[1] := __A.qword[1] -W.qword[0] := __A.qword[0] -dst.qword[3] := W.qword[3] + s0(W.qword[4]) -dst.qword[2] := W.qword[2] + s0(W.qword[3]) -dst.qword[1] := W.qword[1] + s0(W.qword[2]) -dst.qword[0] := W.qword[0] + s0(W.qword[1]) - - - - SHA512 - AVX -
immintrin.h
- Cryptography -
- - - This intrinsic is one of the two SHA512 message scheduling instructions. The intrinsic performs the final calculation for the next four SHA512 message qwords. The calculated results are stored in "dst". - - -DEFINE ROR64(qword, n) { - count := n % 64 - dest := (qword >> count) | (qword << (64 - count)) - RETURN dest -} -DEFINE SHR64(qword, n) { - RETURN qword >> n -} -DEFINE s1(qword) { - RETURN ROR64(qword,19) ^ ROR64(qword, 61) ^ SHR64(qword, 6) -} -W.qword[14] := __B.qword[2] -W.qword[15] := __B.qword[3] -W.qword[16] := __A.qword[0] + s1(W.qword[14]) -W.qword[17] := __A.qword[1] + s1(W.qword[15]) -W.qword[18] := __A.qword[2] + s1(W.qword[16]) -W.qword[19] := __A.qword[3] + s1(W.qword[17]) -dst.qword[3] := W.qword[19] -dst.qword[2] := W.qword[18] -dst.qword[1] := W.qword[17] -dst.qword[0] := W.qword[16] - - - - SHA512 - AVX -
immintrin.h
- Cryptography -
- - - This intrinsic performs two rounds of SHA512 operation using initial SHA512 state (C,D,G,H) from "__A", an initial SHA512 state (A,B,E,F) from "__B", and a pre-computed sum of the next two round message qwords and the corresponding round constants from "__C" (only the two lower qwords of the third operand). The updated SHA512 state (A,B,E,F) is written to "dst", and "dst" can be used as the updated state (C,D,G,H) in later rounds. - - -DEFINE ROR64(qword, n) { - count := n % 64 - dest := (qword >> count) | (qword << (64 - count)) - RETURN dest -} -DEFINE SHR64(qword, n) { - RETURN qword >> n -} -DEFINE cap_sigma0(qword) { - RETURN ROR64(qword, 28) ^ ROR64(qword, 34) ^ ROR64(qword, 39) -} -DEFINE cap_sigma1(qword) { - RETURN ROR64(qword, 14) ^ ROR64(qword, 18) ^ ROR64(qword, 41) -} -DEFINE MAJ(a,b,c) { - RETURN (a & b) ^ (a & c) ^ (b & c) -} -DEFINE CH(a,b,c) { - RETURN (a & b) ^ (c & ~a) -} -A.qword[0] := __B.qword[3] -B.qword[0] := __B.qword[2] -C.qword[0] := __A.qword[3] -D.qword[0] := __A.qword[2] -E.qword[0] := __B.qword[1] -F.qword[0] := __B.qword[0] -G.qword[0] := __A.qword[1] -H.qword[0] := __A.qword[0] -WK.qword[0]:= __C.qword[0] -WK.qword[1]:= __C.qword[1] -FOR i := 0 to 1 - A.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + MAJ(A.qword[i], B.qword[i], C.qword[i]) + cap_sigma0(A.qword[i]) - B.qword[i+1] := A.qword[i] - C.qword[i+1] := B.qword[i] - D.qword[i+1] := C.qword[i] - E.qword[i+1] := CH(E.qword[i], F.qword[i], G.qword[i]) + cap_sigma1(E.qword[i]) + WK.qword[i] + H.qword[i] + D.qword[i] - F.qword[i+1] := E.qword[i] - G.qword[i+1] := F.qword[i] - H.qword[i+1] := G.qword[i] -ENDFOR -dst.qword[3] := A.qword[2] -dst.qword[2] := B.qword[2] -dst.qword[1] := E.qword[2] -dst.qword[0] := F.qword[2] - - - - - SHA512 - AVX -
immintrin.h
- Cryptography -
- - - The VSM3MSG1 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs an initial calculation for the next four SM3 message words. The calculated results are stored in "dst". - - -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32 - count)) - RETURN dest -} -DEFINE P1(x) { - RETURN x ^ ROL32(x, 15) ^ ROL32(x, 23) -} -W.dword[0] := __C.dword[0] -W.dword[1] := __C.dword[1] -W.dword[2] := __C.dword[2] -W.dword[3] := __C.dword[3] -W.dword[7] := __A.dword[0] -W.dword[8] := __A.dword[1] -W.dword[9] := __A.dword[2] -W.dword[10] := __A.dword[3] -W.dword[13] := __B.dword[0] -W.dword[14] := __B.dword[1] -W.dword[15] := __B.dword[2] -TMP0 := W.dword[7] ^ W.dword[0] ^ ROL32(W.dword[13], 15) -TMP1 := W.dword[8] ^ W.dword[1] ^ ROL32(W.dword[14], 15) -TMP2 := W.dword[9] ^ W.dword[2] ^ ROL32(W.dword[15], 15) -TMP3 := W.dword[10] ^ W.dword[3] -dst.dword[0] := P1(TMP0) -dst.dword[1] := P1(TMP1) -dst.dword[2] := P1(TMP2) -dst.dword[3] := P1(TMP3) - - - - - SM3 - AVX -
immintrin.h
- Cryptography -
- - - The VSM3MSG2 intrinsic is one of the two SM3 message scheduling intrinsics. The intrinsic performs the final calculation for the next four SM3 message words. The calculated results are stored in "dst". - - -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -WTMP.dword[0] := __A.dword[0] -WTMP.dword[1] := __A.dword[1] -WTMP.dword[2] := __A.dword[2] -WTMP.dword[3] := __A.dword[3] -W.dword[3] := __B.dword[0] -W.dword[4] := __B.dword[1] -W.dword[5] := __B.dword[2] -W.dword[6] := __B.dword[3] -W.dword[10] := __C.dword[0] -W.dword[11] := __C.dword[1] -W.dword[12] := __C.dword[2] -W.dword[13] := __C.dword[3] -W.dword[16] := ROL32(W.dword[3], 7) ^ W.dword[10] ^ WTMP.dword[0] -W.dword[17] := ROL32(W.dword[4], 7) ^ W.dword[11] ^ WTMP.dword[1] -W.dword[18] := ROL32(W.dword[5], 7) ^ W.dword[12] ^ WTMP.dword[2] -W.dword[19] := ROL32(W.dword[6], 7) ^ W.dword[13] ^ WTMP.dword[3] -W.dword[19] := W.dword[19] ^ ROL32(W.dword[16], 6) ^ ROL32(W.dword[16], 15) ^ ROL32(W.dword[16], 30) -dst.dword[0] := W.dword[16] -dst.dword[1] := W.dword[17] -dst.dword[2] := W.dword[18] -dst.dword[3] := W.dword[19] - - - - - SM3 - AVX -
immintrin.h
- Cryptography -
- - - The intrinsic performs two rounds of SM3 operation using initial SM3 state (C, D, G, H) from "__A", an initial SM3 states (A, B, E, F) from "__B" and a pre-computed words from the "__C". "__A" with initial SM3 state of (C, D, G, H) assumes input of non-rotated left variables from previous state. The updated SM3 state (A, B, E, F) is written to "__A". The "imm8" should contain the even round number for the first of the two rounds computed by this instruction. The computation masks the "imm8" value by ANDing it with 0x3E so that only even round numbers from 0 through 62 are used for this operation. The calculated results are stored in "dst". - - -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -DEFINE P0(x) { - RETURN x ^ ROL32(x, 9) ^ ROL32(x, 17) -} -DEFINE FF(x, y, z, round) { - IF round < 16 - RETURN (x ^ y ^ z) - ELSE - RETURN (x & y) | (x & z) | (y & z) - FI -} -DEFINE GG(x, y, z, round){ - IF round < 16 - RETURN (x ^ y ^ z) - ELSE - RETURN (x & y) | (~x & z) - FI -} -A.dword[0] := __B.dword[3] -B.dword[0] := __B.dword[2] -C.dword[0] := __A.dword[3] -D.dword[0] := __A.dword[2] -E.dword[0] := __B.dword[1] -F.dword[0] := __B.dword[0] -G.dword[0] := __A.dword[1] -H.dword[0] := __A.dword[0] -W.dword[0] := __C.dword[0] -W.dword[1] := __C.dword[1] -W.dword[4] := __C.dword[2] -W.dword[5] := __C.dword[3] -C.dword[0] := ROL32(C.dword[0], 9) -D.dword[0] := ROL32(D.dword[0], 9) -G.dword[0] := ROL32(G.dword[0], 19) -H.dword[0] := ROL32(H.dword[0], 19) -ROUND := imm8 & 0x3E -IF ROUND < 16 - CONST.dword[0] := 0x79CC4519 -ELSE - CONST.dword[0] := 0x7A879D8A -FI -CONST.dword[0] := ROL32(CONST.dword[0], ROUND) -FOR i:= 0 to 1 - temp.dword[0] := ROL32(A.dword[i], 12) + E.dword[i] + CONST.dword[0] - S1.dword[0] := ROL32(temp.dword[0], 7) - S2.dword[0] := S1.dword[0] ^ ROL32(A.dword[i], 12) - T1.dword[0] := FF(A.dword[i], B.dword[i], C.dword[i], ROUND) + D.dword[i] + S2.dword[0] + (W.dword[i] ^ W.dword[i+4]) - 
T2.dword[0] := GG(E.dword[i], F.dword[i], G.dword[i], ROUND) + H.dword[i] + S1.dword[0] + W.dword[i] - D.dword[i+1] := C.dword[i] - C.dword[i+1] := ROL32(B.dword[i], 9) - B.dword[i+1] := A.dword[i] - A.dword[i+1] := T1.dword[0] - H.dword[i+1] := G.dword[i] - G.dword[i+1] := ROL32(F.dword[i], 19) - F.dword[i+1] := E.dword[i] - E.dword[i+1] := P0(T2.dword[0]) - CONST.dword[0] := ROL32(CONST.dword[0], 1) -ENDFOR -dst.dword[3] := A.dword[2] -dst.dword[2] := B.dword[2] -dst.dword[1] := E.dword[2] -dst.dword[0] := F.dword[2] - - - - - - SM3 - AVX -
immintrin.h
- Cryptography -
- - - This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". - - -BYTE sbox[256] = { -0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, -0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, -0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, -0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, -0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, -0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, -0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, -0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, -0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, -0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, -0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, -0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, -0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, -0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, -0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, -0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 -} -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -DEFINE SBOX_BYTE(dword, i) { - RETURN sbox[dword.byte[i]] -} -DEFINE lower_t(dword) { - tmp.byte[0] := SBOX_BYTE(dword, 0) - tmp.byte[1] := 
SBOX_BYTE(dword, 1) - tmp.byte[2] := SBOX_BYTE(dword, 2) - tmp.byte[3] := SBOX_BYTE(dword, 3) - RETURN tmp -} -DEFINE L_KEY(dword) { - RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) -} -DEFINE T_KEY(dword) { - RETURN L_KEY(lower_t(dword)) -} -DEFINE F_KEY(X0, X1, X2, X3, round_key) { - RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) -} -FOR i:= 0 to 1 - P.dword[0] := __A.dword[4*i] - P.dword[1] := __A.dword[4*i+1] - P.dword[2] := __A.dword[4*i+2] - P.dword[3] := __A.dword[4*i+3] - C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) - C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) - C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) - C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) - dst.dword[4*i] := C.dword[0] - dst.dword[4*i+1] := C.dword[1] - dst.dword[4*i+2] := C.dword[2] - dst.dword[4*i+3] := C.dword[3] -ENDFOR -dst[MAX:256] := 0 - - - - SM4 - AVX -
immintrin.h
- Cryptography -
- - - This intrinisc performs four rounds of SM4 encryption. The intrinisc operates on independent 128-bit lanes. The calculated results are stored in "dst". - - BYTE sbox[256] = { -0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, -0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, -0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, -0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, -0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, -0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, -0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, -0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, -0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, -0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, -0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, -0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, -0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, -0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, -0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, -0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 -} -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -DEFINE SBOX_BYTE(dword, i) { - RETURN sbox[dword.byte[i]] -} -DEFINE lower_t(dword) { - tmp.byte[0] := SBOX_BYTE(dword, 0) - tmp.byte[1] := 
SBOX_BYTE(dword, 1) - tmp.byte[2] := SBOX_BYTE(dword, 2) - tmp.byte[3] := SBOX_BYTE(dword, 3) - RETURN tmp -} -DEFINE L_RND(dword) { - tmp := dword - tmp := tmp ^ ROL32(dword, 2) - tmp := tmp ^ ROL32(dword, 10) - tmp := tmp ^ ROL32(dword, 18) - tmp := tmp ^ ROL32(dword, 24) - RETURN tmp -} -DEFINE T_RND(dword) { - RETURN L_RND(lower_t(dword)) -} -DEFINE F_RND(X0, X1, X2, X3, round_key) { - RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) -} -FOR i:= 0 to 1 - P.dword[0] := __A.dword[4*i] - P.dword[1] := __A.dword[4*i+1] - P.dword[2] := __A.dword[4*i+2] - P.dword[3] := __A.dword[4*i+3] - C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[4*i]) - C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[4*i+1]) - C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[4*i+2]) - C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[4*i+3]) - dst.dword[4*i] := C.dword[0] - dst.dword[4*i+1] := C.dword[1] - dst.dword[4*i+2] := C.dword[2] - dst.dword[4*i+3] := C.dword[3] -ENDFOR -dst[MAX:256] := 0 - - - - SM4 - AVX -
immintrin.h
- Cryptography -
- - - This intrinsic performs four rounds of SM4 key expansion. The intrinsic operates on independent 128-bit lanes. The calculated results are stored in "dst". - - -BYTE sbox[256] = { -0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, -0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, -0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, -0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, -0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, -0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, -0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, -0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, -0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, -0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, -0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, -0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, -0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, -0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, -0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, -0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 -} -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -DEFINE SBOX_BYTE(dword, i) { - RETURN sbox[dword.byte[i]] -} -DEFINE lower_t(dword) { - tmp.byte[0] := SBOX_BYTE(dword, 0) - tmp.byte[1] := 
SBOX_BYTE(dword, 1) - tmp.byte[2] := SBOX_BYTE(dword, 2) - tmp.byte[3] := SBOX_BYTE(dword, 3) - RETURN tmp -} -DEFINE L_KEY(dword) { - RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) -} -DEFINE T_KEY(dword) { - RETURN L_KEY(lower_t(dword)) -} -DEFINE F_KEY(X0, X1, X2, X3, round_key) { - RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) -} -P.dword[0] := __A.dword[0] -P.dword[1] := __A.dword[1] -P.dword[2] := __A.dword[2] -P.dword[3] := __A.dword[3] -C.dword[0] := F_KEY(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) -C.dword[1] := F_KEY(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) -C.dword[2] := F_KEY(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) -C.dword[3] := F_KEY(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) -dst.dword[0] := C.dword[0] -dst.dword[1] := C.dword[1] -dst.dword[2] := C.dword[2] -dst.dword[3] := C.dword[3] -dst[MAX:128] := 0 - - - - SM4 - AVX -
immintrin.h
- Cryptography -
- - - This intrinisc performs four rounds of SM4 encryption. The intrinisc operates on independent 128-bit lanes. The calculated results are stored in "dst". - - -BYTE sbox[256] = { -0xD6, 0x90, 0xE9, 0xFE, 0xCC, 0xE1, 0x3D, 0xB7, 0x16, 0xB6, 0x14, 0xC2, 0x28, 0xFB, 0x2C, 0x05, -0x2B, 0x67, 0x9A, 0x76, 0x2A, 0xBE, 0x04, 0xC3, 0xAA, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, -0x9C, 0x42, 0x50, 0xF4, 0x91, 0xEF, 0x98, 0x7A, 0x33, 0x54, 0x0B, 0x43, 0xED, 0xCF, 0xAC, 0x62, -0xE4, 0xB3, 0x1C, 0xA9, 0xC9, 0x08, 0xE8, 0x95, 0x80, 0xDF, 0x94, 0xFA, 0x75, 0x8F, 0x3F, 0xA6, -0x47, 0x07, 0xA7, 0xFC, 0xF3, 0x73, 0x17, 0xBA, 0x83, 0x59, 0x3C, 0x19, 0xE6, 0x85, 0x4F, 0xA8, -0x68, 0x6B, 0x81, 0xB2, 0x71, 0x64, 0xDA, 0x8B, 0xF8, 0xEB, 0x0F, 0x4B, 0x70, 0x56, 0x9D, 0x35, -0x1E, 0x24, 0x0E, 0x5E, 0x63, 0x58, 0xD1, 0xA2, 0x25, 0x22, 0x7C, 0x3B, 0x01, 0x21, 0x78, 0x87, -0xD4, 0x00, 0x46, 0x57, 0x9F, 0xD3, 0x27, 0x52, 0x4C, 0x36, 0x02, 0xE7, 0xA0, 0xC4, 0xC8, 0x9E, -0xEA, 0xBF, 0x8A, 0xD2, 0x40, 0xC7, 0x38, 0xB5, 0xA3, 0xF7, 0xF2, 0xCE, 0xF9, 0x61, 0x15, 0xA1, -0xE0, 0xAE, 0x5D, 0xA4, 0x9B, 0x34, 0x1A, 0x55, 0xAD, 0x93, 0x32, 0x30, 0xF5, 0x8C, 0xB1, 0xE3, -0x1D, 0xF6, 0xE2, 0x2E, 0x82, 0x66, 0xCA, 0x60, 0xC0, 0x29, 0x23, 0xAB, 0x0D, 0x53, 0x4E, 0x6F, -0xD5, 0xDB, 0x37, 0x45, 0xDE, 0xFD, 0x8E, 0x2F, 0x03, 0xFF, 0x6A, 0x72, 0x6D, 0x6C, 0x5B, 0x51, -0x8D, 0x1B, 0xAF, 0x92, 0xBB, 0xDD, 0xBC, 0x7F, 0x11, 0xD9, 0x5C, 0x41, 0x1F, 0x10, 0x5A, 0xD8, -0x0A, 0xC1, 0x31, 0x88, 0xA5, 0xCD, 0x7B, 0xBD, 0x2D, 0x74, 0xD0, 0x12, 0xB8, 0xE5, 0xB4, 0xB0, -0x89, 0x69, 0x97, 0x4A, 0x0C, 0x96, 0x77, 0x7E, 0x65, 0xB9, 0xF1, 0x09, 0xC5, 0x6E, 0xC6, 0x84, -0x18, 0xF0, 0x7D, 0xEC, 0x3A, 0xDC, 0x4D, 0x20, 0x79, 0xEE, 0x5F, 0x3E, 0xD7, 0xCB, 0x39, 0x48 -} -DEFINE ROL32(dword, n) { - count := n % 32 - dest := (dword << count) | (dword >> (32-count)) - RETURN dest -} -DEFINE SBOX_BYTE(dword, i) { - RETURN sbox[dword.byte[i]] -} -DEFINE lower_t(dword) { - tmp.byte[0] := SBOX_BYTE(dword, 0) - tmp.byte[1] := 
SBOX_BYTE(dword, 1) - tmp.byte[2] := SBOX_BYTE(dword, 2) - tmp.byte[3] := SBOX_BYTE(dword, 3) - RETURN tmp -} -DEFINE L_RND(dword) { - tmp := dword - tmp := tmp ^ ROL32(dword, 2) - tmp := tmp ^ ROL32(dword, 10) - tmp := tmp ^ ROL32(dword, 18) - tmp := tmp ^ ROL32(dword, 24) - RETURN tmp -} -DEFINE T_RND(dword) { - RETURN L_RND(lower_t(dword)) -} -DEFINE F_RND(X0, X1, X2, X3, round_key) { - RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) -} -P.dword[0] := __A.dword[0] -P.dword[1] := __A.dword[1] -P.dword[2] := __A.dword[2] -P.dword[3] := __A.dword[3] -C.dword[0] := F_RND(P.dword[0], P.dword[1], P.dword[2], P.dword[3], __B.dword[0]) -C.dword[1] := F_RND(P.dword[1], P.dword[2], P.dword[3], C.dword[0], __B.dword[1]) -C.dword[2] := F_RND(P.dword[2], P.dword[3], C.dword[0], C.dword[1], __B.dword[2]) -C.dword[3] := F_RND(P.dword[3], C.dword[0], C.dword[1], C.dword[2], __B.dword[3]) -dst.dword[0] := C.dword[0] -dst.dword[1] := C.dword[1] -dst.dword[2] := C.dword[2] -dst.dword[3] := C.dword[3] -dst[MAX:128] := 0 - - - - SM4 - AVX -
immintrin.h
- Cryptography -
- - - - Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ACOS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ACOS(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ACOSH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ACOSH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ASIN(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ASIN(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ASINH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ASINH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ATAN(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ATAN(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ATAN2(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in "a" divided by packed elements in "b", and store the results in "dst" expressed in radians. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ATAN2(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ATANH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ATANH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := COSD(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := COSD(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := COSH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := COSH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SQRT(POW(a[i+63:i], 2.0) + POW(b[i+63:i], 2.0)) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(POW(a[i+31:i], 2.0) + POW(b[i+31:i], 2.0)) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIN(a[i+63:i]) - MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - - Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, store the sine in "dst", and store the cosine into memory at "mem_addr". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIN(a[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SIND(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the sine of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SIND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SINH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SINH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TAN(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TAN(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TAND(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in degrees, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TAND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TANH(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in "a" expressed in radians, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TANH(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Trigonometry -
- - - - Compute the cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CEXP(a[31:0], b[31:0]) { - result[31:0] := POW(FP32(e), a[31:0]) * COS(b[31:0]) - result[63:32] := POW(FP32(e), a[31:0]) * SIN(b[31:0]) - RETURN result -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CEXP(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CLOG(a[31:0], b[31:0]) { - result[31:0] := LOG(SQRT(POW(a, 2.0) + POW(b, 2.0))) - result[63:32] := ATAN2(b, a) - RETURN result -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CLOG(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed complex numbers in "a", and store the complex results in "dst". Each complex number is composed of two adjacent single-precision (32-bit) floating-point elements, which defines the complex number "complex = vec.fp32[0] + i * vec.fp32[1]". - -DEFINE CSQRT(a[31:0], b[31:0]) { - sign[31:0] := (b < 0.0) ? -FP32(1.0) : FP32(1.0) - result[31:0] := SQRT((a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - result[63:32] := sign * SQRT((-a + SQRT(POW(a, 2.0) + POW(b, 2.0))) / 2.0) - RETURN result -} -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CSQRT(a[i+31:i], a[i+63:i+32]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(10.0, a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POW(FP32(10.0), a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(2.0, a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POW(FP32(2.0), a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed double-precision (64-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(e, a[i+63:i]) - 1.0 -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the exponential value of "e" raised to the power of packed single-precision (32-bit) floating-point elements in "a", subtract one from each element, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POW(FP32(e), a[i+31:i]) - 1.0 -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := InvCubeRoot(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := InvCubeRoot(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := InvSQRT(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the inverse square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := InvSQRT(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(10.0) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(10.0) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(1.0 + a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(1.0 + a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := LOG(a[i+63:i]) / LOG(2.0) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := LOG(a[i+31:i]) / LOG(2.0) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed double-precision (64-bit) floating-point element in "a" to a double-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ConvertExpFP64(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Convert the exponent of each packed single-precision (32-bit) floating-point element in "a" to a single-precision floating-point number representing the integer exponent, and store the results in "dst". This intrinsic essentially calculates "floor(log2(x))" for each element. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ConvertExpFP32(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed double-precision (64-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := POW(a[i+63:i], b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - - Compute the exponential value of packed single-precision (32-bit) floating-point elements in "a" raised by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := POW(a[i+31:i], b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_pd". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". Note that this intrinsic is less efficient than "_mm_sqrt_ps". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Elementary Math Functions -
- - - - Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := InverseCDFNormal(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in "a" using the normal distribution, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := InverseCDFNormal(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ERF(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 - ERF(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := 1.0 - ERF(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i])) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i])) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := 1.0 / ERF(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - Compute the inverse error function of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := 1.0 / ERF(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Probability/Statistics -
- - - - - Divide packed signed 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 3 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed signed 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 1 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 15 - i := 8*j - IF b[i+7:i] == 0 - #DE - FI - dst[i+7:i] := Truncate8(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 7 - i := 16*j - IF b[i+15:i] == 0 - #DE - FI - dst[i+15:i] := Truncate16(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 3 - i := 32*j - IF b[i+31:i] == 0 - #DE - FI - dst[i+31:i] := Truncate32(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - -FOR j := 0 to 1 - i := 64*j - IF b[i+63:i] == 0 - #DE - FI - dst[i+63:i] := Truncate64(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - Compute the error function of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ERF(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - - Divide packed 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed 32-bit integers into memory at "mem_addr". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 8-bit integers in "a" by packed elements in "b", and store the remainders as packed 8-bit integers in "dst". - FOR j := 0 to 15 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 16-bit integers in "a" by packed elements in "b", and store the remainders as packed 16-bit integers in "dst". - FOR j := 0 to 7 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 32-bit integers in "a" by packed elements in "b", and store the remainders as packed 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed 64-bit integers in "a" by packed elements in "b", and store the remainders as packed 64-bit integers in "dst". - FOR j := 0 to 1 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 8-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 8-bit integers in "dst". - FOR j := 0 to 15 - i := 8*j - dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 16-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 16-bit integers in "dst". - FOR j := 0 to 7 - i := 16*j - dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 64-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 64-bit integers in "dst". - FOR j := 0 to 1 - i := 64*j - dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the truncated results in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", store the truncated results in "dst", and store the remainders as packed unsigned 32-bit integers into memory at "mem_addr". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i]) - MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - - Divide packed unsigned 32-bit integers in "a" by packed elements in "b", and store the remainders as packed unsigned 32-bit integers in "dst". - FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Arithmetic -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" to the nearest integer value, and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Special Math Functions -
- - - - Truncate the packed double-precision (64-bit) floating-point elements in "a", and store the results as packed double-precision floating-point elements in "dst". This intrinsic may generate the "roundpd"/"vroundpd" instruction. - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := TRUNCATE(a[i+63:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Miscellaneous -
- - - - Truncate the packed single-precision (32-bit) floating-point elements in "a", and store the results as packed single-precision floating-point elements in "dst". This intrinsic may generate the "roundps"/"vroundps" instruction. - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := TRUNCATE(a[i+31:i]) -ENDFOR -dst[MAX:128] := 0 - - SSE -
immintrin.h
- Miscellaneous -
- - - - - - - - - Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in "row0", "row1", "row2", and "row3", and store the transposed matrix in these vectors ("row0" now contains column 0, etc.). - -__m128 tmp3, tmp2, tmp1, tmp0; -tmp0 := _mm_unpacklo_ps(row0, row1); -tmp2 := _mm_unpacklo_ps(row2, row3); -tmp1 := _mm_unpackhi_ps(row0, row1); -tmp3 := _mm_unpackhi_ps(row2, row3); -row0 := _mm_movelh_ps(tmp0, tmp2); -row1 := _mm_movehl_ps(tmp2, tmp0); -row2 := _mm_movelh_ps(tmp1, tmp3); -row3 := _mm_movehl_ps(tmp3, tmp1); - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - -dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] -dst[31:16] := 0 - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - -dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0] -dst[31:16] := 0 - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". - -dst[63:0] := a[63:0] -sel := imm8[1:0]*16 -dst[sel+15:sel] := i[15:0] - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". - -dst[63:0] := a[63:0] -sel := imm8[1:0]*16 -dst[sel+15:sel] := i[15:0] - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[15:0] := src[15:0] - 1: tmp[15:0] := src[31:16] - 2: tmp[15:0] := src[47:32] - 3: tmp[15:0] := src[63:48] - ESAC - RETURN tmp[15:0] -} -dst[15:0] := SELECT4(a[63:0], imm8[1:0]) -dst[31:16] := SELECT4(a[63:0], imm8[3:2]) -dst[47:32] := SELECT4(a[63:0], imm8[5:4]) -dst[63:48] := SELECT4(a[63:0], imm8[7:6]) - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[15:0] := src[15:0] - 1: tmp[15:0] := src[31:16] - 2: tmp[15:0] := src[47:32] - 3: tmp[15:0] := src[63:48] - ESAC - RETURN tmp[15:0] -} -dst[15:0] := SELECT4(a[63:0], imm8[1:0]) -dst[31:16] := SELECT4(a[63:0], imm8[3:2]) -dst[47:32] := SELECT4(a[63:0], imm8[5:4]) -dst[63:48] := SELECT4(a[63:0], imm8[7:6]) - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - - Shuffle single-precision (32-bit) floating-point elements in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(b[127:0], imm8[5:4]) -dst[127:96] := SELECT4(b[127:0], imm8[7:6]) - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) - - - SSE -
xmmintrin.h
- Swizzle -
- - - - - Unpack and interleave single-precision (32-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) - - - SSE -
xmmintrin.h
- Swizzle -
- - - - Get the unsigned 32-bit value of the MXCSR control and status register. - dst[31:0] := MXCSR - - - SSE -
immintrin.h
- General Support -
- - - - Set the MXCSR control and status register with the value in unsigned 32-bit integer "a". - -MXCSR := a[31:0] - - - SSE -
immintrin.h
- General Support -
- - - Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT - dst[31:0] := MXCSR & _MM_EXCEPT_MASK - - SSE -
immintrin.h
- General Support -
- - - - Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT - MXCSR := a[31:0] AND ~_MM_EXCEPT_MASK - - SSE -
immintrin.h
- General Support -
- - - Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT - dst[31:0] := MXCSR & _MM_MASK_MASK - - SSE -
immintrin.h
- General Support -
- - - - Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT - MXCSR := a[31:0] AND ~_MM_MASK_MASK - - SSE -
immintrin.h
- General Support -
- - - Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO - dst[31:0] := MXCSR & _MM_ROUND_MASK - - SSE -
immintrin.h
- General Support -
- - - - Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO - MXCSR := a[31:0] AND ~_MM_ROUND_MASK - - SSE -
immintrin.h
- General Support -
- - - Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF - dst[31:0] := MXCSR & _MM_FLUSH_MASK - - SSE -
immintrin.h
- General Support -
- - - - Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer "a". The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF - MXCSR := a[31:0] AND ~_MM_FLUSH_MASK - - SSE -
immintrin.h
- General Support -
- - - - - Fetch the line of data from memory that contains address "p" to a location in the cache hierarchy specified by the locality hint "i", which can be one of:<ul> - <li>_MM_HINT_T0 // 3, move data using the T0 hint. The PREFETCHT0 instruction will be generated.</li> - <li>_MM_HINT_T1 // 2, move data using the T1 hint. The PREFETCHT1 instruction will be generated.</li> - <li>_MM_HINT_T2 // 1, move data using the T2 hint. The PREFETCHT2 instruction will be generated.</li> - <li>_MM_HINT_NTA // 0, move data using the non-temporal access (NTA) hint. The PREFETCHNTA instruction will be generated.</li> - - - - - - SSE -
immintrin.h
- General Support -
- - - - Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order. - - SSE -
immintrin.h
- General Support -
- - - - - Allocate "size" bytes of memory, aligned to the alignment specified in "align", and return a pointer to the allocated memory. "_mm_free" should be used to free memory that is allocated with "_mm_malloc". - SSE -
immintrin.h
- General Support -
- - - - Free aligned memory that was allocated with "_mm_malloc". - SSE -
immintrin.h
- General Support -
- - - - Return vector of type __m128 with undefined elements. - SSE -
immintrin.h
- General Support -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [min_float_note] - -dst[31:0] := MIN(a[31:0], b[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". [max_float_note] - -dst[31:0] := MAX(a[31:0], b[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Special Math Functions -
- - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". - -FOR j := 0 to 7 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] -dst[63:16] := 0 - - - SSE -
xmmintrin.h
- Arithmetic -
- - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store this unsigned 16-bit integer in the low 16 bits of "dst". - -FOR j := 0 to 7 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56] -dst[63:16] := 0 - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Add the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] + b[31:0] -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Add packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Subtract the lower single-precision (32-bit) floating-point element in "b" from the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] - b[31:0] -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Subtract packed single-precision (32-bit) floating-point elements in "b" from packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Multiply the lower single-precision (32-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] * b[31:0] -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Multiply packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] * b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Divide the lower single-precision (32-bit) floating-point element in "a" by the lower single-precision (32-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := a[31:0] / b[31:0] -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Divide packed single-precision (32-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := a[i+31:i] / b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Arithmetic -
- - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR - - - SSE -
xmmintrin.h
- Probability/Statistics -
- - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR - - - SSE -
xmmintrin.h
- Probability/Statistics -
- - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR - - - SSE -
xmmintrin.h
- Probability/Statistics -
- - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR - - - SSE -
xmmintrin.h
- Probability/Statistics -
- - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Convert -
- - - - - Convert the signed 32-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - SSE -
xmmintrin.h
- Convert -
- - - - - Convert packed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] - - - SSE -
xmmintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", and copy the upper 2 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - m := j*32 - dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed unsigned 16-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*16 - m := j*32 - dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower packed 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*8 - m := j*32 - dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower packed unsigned 8-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := j*8 - m := j*32 - dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of "dst", then convert the packed signed 32-bit integers in "b" to packed single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of "dst". - -dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -dst[127:96] := Convert_Int32_To_FP32(b[63:32]) - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Copy the lower single-precision (32-bit) floating-point element of "a" to "dst". - -dst[31:0] := a[31:0] - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert the lower single-precision (32-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 16-bit integers, and store the results in "dst". Note: this intrinsic will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and 0x7FFFFFFF. - -FOR j := 0 to 3 - i := 16*j - k := 32*j - IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF) - dst[i+15:i] := 0x7FFF - ELSE - dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k]) - FI -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 8-bit integers, and store the results in lower 4 elements of "dst". Note: this intrinsic will generate 0x7F, rather than 0x80, for input values between 0x7F and 0x7FFFFFFF. - -FOR j := 0 to 3 - i := 8*j - k := 32*j - IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF) - dst[i+7:i] := 0x7F - ELSE - dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k]) - FI -ENDFOR - - SSE -
xmmintrin.h
- Convert -
- - - - - Store 64-bits of integer data from "a" into memory using a non-temporal memory hint. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE -
immintrin.h
- Store -
- - - - - - Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. - -FOR j := 0 to 7 - i := j*8 - IF mask[i+7] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - SSE -
immintrin.h
- Store -
- - - - - - Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element). - -FOR j := 0 to 7 - i := j*8 - IF mask[i+7] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - SSE -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE -
immintrin.h
- Store -
- - - - - Store the upper 2 single-precision (32-bit) floating-point elements from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[95:64] -MEM[mem_addr+63:mem_addr+32] := a[127:96] - - - SSE -
immintrin.h
- Store -
- - - - - Store the lower 2 single-precision (32-bit) floating-point elements from "a" into memory. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[63:32] - - - SSE -
immintrin.h
- Store -
- - - - - Store the lower single-precision (32-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - - SSE -
immintrin.h
- Store -
- - - - - Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[31:0] -MEM[mem_addr+95:mem_addr+64] := a[31:0] -MEM[mem_addr+127:mem_addr+96] := a[31:0] - - SSE -
immintrin.h
- Store -
- - - - - Store the lower single-precision (32-bit) floating-point element from "a" into 4 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+31:mem_addr] := a[31:0] -MEM[mem_addr+63:mem_addr+32] := a[31:0] -MEM[mem_addr+95:mem_addr+64] := a[31:0] -MEM[mem_addr+127:mem_addr+96] := a[31:0] - - SSE -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE -
immintrin.h
- Store -
- - - - - Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE -
immintrin.h
- Store -
- - - - - Store 4 single-precision (32-bit) floating-point elements from "a" into memory in reverse order. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+31:mem_addr] := a[127:96] -MEM[mem_addr+63:mem_addr+32] := a[95:64] -MEM[mem_addr+95:mem_addr+64] := a[63:32] -MEM[mem_addr+127:mem_addr+96] := a[31:0] - - - SSE -
immintrin.h
- Store -
- - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[j] := a[i+7] -ENDFOR -dst[MAX:8] := 0 - - - SSE -
xmmintrin.h
- Miscellaneous -
- - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[j] := a[i+7] -ENDFOR -dst[MAX:8] := 0 - - - SSE -
xmmintrin.h
- Miscellaneous -
- - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in "a". - -FOR j := 0 to 3 - i := j*32 - IF a[i+31] - dst[j] := 1 - ELSE - dst[j] := 0 - FI -ENDFOR -dst[MAX:4] := 0 - - - SSE -
xmmintrin.h
- Miscellaneous -
- - - - Compute the square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := SQRT(a[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SQRT(a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -dst[31:0] := (1.0 / a[31:0]) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / a[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -dst[31:0] := (1.0 / SQRT(a[31:0])) -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in "a", and store the results in "dst". The maximum relative error for this approximation is less than 1.5*2^-12. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (1.0 / SQRT(a[i+31:i])) -ENDFOR - - - SSE -
xmmintrin.h
- Elementary Math Functions -
- - - - - Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (a[i+31:i] AND b[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed single-precision (32-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i]) -ENDFOR - - - SSE -
xmmintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] OR b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] XOR b[i+31:i] -ENDFOR - - - SSE -
xmmintrin.h
- Logical -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (!( a[31:0] < b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (!( a[31:0] <= b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] <= b[i+31:i] )) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (!( a[31:0] > b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] > b[i+31:i] )) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := (!( a[31:0] >= b[31:0] )) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := (!( a[i+31:i] >= b[i+31:i] )) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xFFFFFFFF : 0 -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare packed single-precision (32-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". - FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] == b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] < b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] <= b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] > b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] != NaN AND b[31:0] != NaN AND a[31:0] >= b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - - Compare the lower single-precision (32-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[31:0] == NaN OR b[31:0] == NaN OR a[31:0] != b[31:0] ) ? 1 : 0 - - - SSE -
xmmintrin.h
- Compare -
- - - - Copy single-precision (32-bit) floating-point element "a" to the lower element of "dst", and zero the upper 3 elements. - -dst[31:0] := a[31:0] -dst[127:32] := 0 - - SSE -
xmmintrin.h
- Set -
- - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR - - SSE -
xmmintrin.h
- Set -
- - - - Broadcast single-precision (32-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR - - SSE -
xmmintrin.h
- Set -
- - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 - - SSE -
xmmintrin.h
- Set -
- - - - - - - Set packed single-precision (32-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[31:0] := e3 -dst[63:32] := e2 -dst[95:64] := e1 -dst[127:96] := e0 - - SSE -
xmmintrin.h
- Set -
- - - - Return vector of type __m128 with all elements set to zero. - -dst[MAX:0] := 0 - - - SSE -
xmmintrin.h
- Set -
- - - - - Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of "dst", and copy the lower 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := a[31:0] -dst[63:32] := a[63:32] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+63:mem_addr+32] - - - SSE -
immintrin.h
- Load -
- - - - - Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of "dst", and copy the upper 2 elements from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+63:mem_addr+32] -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] - - - SSE -
immintrin.h
- Load -
- - - - Load a single-precision (32-bit) floating-point element from memory into the lower of "dst", and zero the upper 3 elements. "mem_addr" does not need to be aligned on any particular boundary. - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[127:32] := 0 - - - SSE -
immintrin.h
- Load -
- - - - Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+31:mem_addr] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+31:mem_addr] - - SSE -
immintrin.h
- Load -
- - - - Load a single-precision (32-bit) floating-point element from memory into all elements of "dst". - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[63:32] := MEM[mem_addr+31:mem_addr] -dst[95:64] := MEM[mem_addr+31:mem_addr] -dst[127:96] := MEM[mem_addr+31:mem_addr] - - SSE -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE -
immintrin.h
- Load -
- - - - Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE -
immintrin.h
- Load -
- - - - Load 4 single-precision (32-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[31:0] := MEM[mem_addr+127:mem_addr+96] -dst[63:32] := MEM[mem_addr+95:mem_addr+64] -dst[95:64] := MEM[mem_addr+63:mem_addr+32] -dst[127:96] := MEM[mem_addr+31:mem_addr] - - SSE -
immintrin.h
- Load -
- - - - - Move the lower single-precision (32-bit) floating-point element from "b" to the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := b[31:0] -dst[127:32] := a[127:32] - - - SSE -
xmmintrin.h
- Move -
- - - - - Move the upper 2 single-precision (32-bit) floating-point elements from "b" to the lower 2 elements of "dst", and copy the upper 2 elements from "a" to the upper 2 elements of "dst". - -dst[31:0] := b[95:64] -dst[63:32] := b[127:96] -dst[95:64] := a[95:64] -dst[127:96] := a[127:96] - - - SSE -
xmmintrin.h
- Move -
- - - - - Move the lower 2 single-precision (32-bit) floating-point elements from "b" to the upper 2 elements of "dst", and copy the lower 2 elements from "a" to the lower 2 elements of "dst". - -dst[31:0] := a[31:0] -dst[63:32] := a[63:32] -dst[95:64] := b[31:0] -dst[127:96] := b[63:32] - - - SSE -
xmmintrin.h
- Move -
- - - - - - Return vector of type __m128d with undefined elements. - SSE2 -
emmintrin.h
- General Support -
- - - - Return vector of type __m128i with undefined elements. - SSE2 -
emmintrin.h
- General Support -
- - - - Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops. - - SSE2 -
emmintrin.h
- General Support -
- - - - Invalidate and flush the cache line that contains "p" from all levels of the cache hierarchy. - - SSE2 -
emmintrin.h
- General Support -
- - - - Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order. - - SSE2 -
emmintrin.h
- General Support -
- - - - Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order. - - SSE2 -
emmintrin.h
- General Support -
- - - - Load unaligned 64-bit integer from memory into the first element of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[MAX:64] := 0 - - - SSE2 -
immintrin.h
- Load -
- - - - Load unaligned 16-bit integer from memory into the first element of "dst". - -dst[15:0] := MEM[mem_addr+15:mem_addr] -dst[MAX:16] := 0 - - SSE2 -
immintrin.h
- Load -
- - - - Load unaligned 32-bit integer from memory into the first element of "dst". - -dst[31:0] := MEM[mem_addr+31:mem_addr] -dst[MAX:32] := 0 - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 64-bit integer from memory into the first element of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[MAX:64] := 0 - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 128-bits of integer data from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 128-bits of integer data from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := MEM[mem_addr+63:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := MEM[mem_addr+63:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 2 double-precision (64-bit) floating-point elements from memory into "dst" in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[63:0] := MEM[mem_addr+127:mem_addr+64] -dst[127:64] := MEM[mem_addr+63:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into "dst". - "mem_addr" does not need to be aligned on any particular boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - Load a double-precision (64-bit) floating-point element from memory into the lower of "dst", and zero the upper element. "mem_addr" does not need to be aligned on any particular boundary. - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Load -
- - - - - Load a double-precision (64-bit) floating-point element from memory into the upper element of "dst", and copy the lower element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[63:0] := a[63:0] -dst[127:64] := MEM[mem_addr+63:mem_addr] - - - SSE2 -
emmintrin.h
- Load -
- - - - - Load a double-precision (64-bit) floating-point element from memory into the lower element of "dst", and copy the upper element from "a" to "dst". "mem_addr" does not need to be aligned on any particular boundary. - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Load -
- - - - - Store 16-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+15:mem_addr] := a[15:0] - - SSE2 -
immintrin.h
- Store -
- - - - - Store 64-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE2 -
immintrin.h
- Store -
- - - - - Store 32-bit integer from the first element of "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - - Conditionally store 8-bit integer elements from "a" into memory using "mask" (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. "mem_addr" does not need to be aligned on any particular boundary. - -FOR j := 0 to 15 - i := j*8 - IF mask[i+7] - MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i] - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits of integer data from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits of integer data from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 64-bit integer from the first element of "a" into memory. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits of integer data from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 32-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. - -MEM[mem_addr+31:mem_addr] := a[31:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 64-bit integer "a" into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address "mem_addr" is already in the cache, the cache will be updated. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory. "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+63:mem_addr] := a[63:0] -MEM[mem_addr+127:mem_addr+64] := a[63:0] - - SSE2 -
emmintrin.h
- Store -
- - - - - Store the lower double-precision (64-bit) floating-point element from "a" into 2 contiguous elements in memory. "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+63:mem_addr] := a[63:0] -MEM[mem_addr+127:mem_addr+64] := a[63:0] - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from "a" into memory. - "mem_addr" does not need to be aligned on any particular boundary. - -MEM[mem_addr+127:mem_addr] := a[127:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store 2 double-precision (64-bit) floating-point elements from "a" into memory in reverse order. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -MEM[mem_addr+63:mem_addr] := a[127:64] -MEM[mem_addr+127:mem_addr+64] := a[63:0] - - SSE2 -
emmintrin.h
- Store -
- - - - - Store the upper double-precision (64-bit) floating-point element from "a" into memory. - -MEM[mem_addr+63:mem_addr] := a[127:64] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Store the lower double-precision (64-bit) floating-point element from "a" into memory. - -MEM[mem_addr+63:mem_addr] := a[63:0] - - - SSE2 -
emmintrin.h
- Store -
- - - - - Add packed 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[i+7:i] + b[i+7:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[i+15:i] + b[i+15:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed 32-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] + b[i+31:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add 64-bit integers "a" and "b", and store the result in "dst". - -dst[63:0] := a[63:0] + b[63:0] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed 64-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed signed 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed signed 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 8-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := SaturateU8( a[i+7:i] + b[i+7:i] ) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed unsigned 16-bit integers in "a" and "b" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SaturateU16( a[i+15:i] + b[i+15:i] ) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := SignExtend32(a[i+31:i+16]*b[i+31:i+16]) + SignExtend32(a[i+15:i]*b[i+15:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the packed signed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[31:16] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the packed unsigned 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := a[i+15:i] * b[i+15:i] - dst[i+15:i] := tmp[31:16] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the packed 16-bit integers in "a" and "b", producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i]) - dst[i+15:i] := tmp[15:0] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the low unsigned 32-bit integers from "a" and "b", and store the unsigned 64-bit result in "dst". - -dst[63:0] := a[31:0] * b[31:0] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the low unsigned 32-bit integers from each packed 64-bit element in "a" and "b", and store the unsigned 64-bit results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+31:i] * b[i+31:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - Miscellaneous - - - - Compute the absolute differences of packed unsigned 8-bit integers in "a" and "b", then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in "dst". - -FOR j := 0 to 15 - i := j*8 - tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -ENDFOR -FOR j := 0 to 1 - i := j*64 - dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + \ - tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56] - dst[i+63:i+16] := 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed 8-bit integers in "b" from packed 8-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[i+7:i] - b[i+7:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed 16-bit integers in "b" from packed 16-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[i+15:i] - b[i+15:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed 32-bit integers in "b" from packed 32-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[i+31:i] - b[i+31:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract 64-bit integer "b" from 64-bit integer "a", and store the result in "dst". - -dst[63:0] := a[63:0] - b[63:0] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed 64-bit integers in "b" from packed 64-bit integers in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 8-bit integers in "b" from packed 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := Saturate8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed signed 16-bit integers in "b" from packed 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := Saturate16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 8-bit integers in "b" from packed unsigned 8-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := SaturateU8(a[i+7:i] - b[i+7:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed unsigned 16-bit integers in "b" from packed unsigned 16-bit integers in "a" using saturation, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := SaturateU16(a[i+15:i] - b[i+15:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := a[63:0] + b[63:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Add packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] + b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Divide the lower double-precision (64-bit) floating-point element in "a" by the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := a[63:0] / b[63:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Divide packed double-precision (64-bit) floating-point elements in "a" by packed elements in "b", and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - dst[i+63:i] := a[i+63:i] / b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply the lower double-precision (64-bit) floating-point element in "a" and "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := a[63:0] * b[63:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Multiply packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] * b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract the lower double-precision (64-bit) floating-point element in "b" from the lower double-precision (64-bit) floating-point element in "a", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := a[63:0] - b[63:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Subtract packed double-precision (64-bit) floating-point elements in "b" from packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] - b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Arithmetic -
- - - - - Average packed unsigned 8-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -ENDFOR - - - SSE2 -
emmintrin.h
- Probability/Statistics -
- - - - - Average packed unsigned 16-bit integers in "a" and "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -ENDFOR - - - SSE2 -
emmintrin.h
- Probability/Statistics -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the maximum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [max_float_note] - -dst[63:0] := MAX(a[63:0], b[63:0]) -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed maximum values in "dst". [max_float_note] - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MAX(a[i+63:i], b[i+63:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b", store the minimum value in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". [min_float_note] - -dst[63:0] := MIN(a[63:0], b[63:0]) -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b", and store packed minimum values in "dst". [min_float_note] - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := MIN(a[i+63:i], b[i+63:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Special Math Functions -
- - - - - Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift "a" left by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] << (tmp*8) - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" left by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0) - ELSE - dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in sign bits, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0) - ELSE - dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift "a" right by "imm8" bytes while shifting in zeros, and store the results in "dst". - -tmp := imm8[7:0] -IF tmp > 15 - tmp := 16 -FI -dst[127:0] := a[127:0] >> (tmp*8) - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF imm8[7:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 16-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF count[63:0] > 15 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF imm8[7:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 32-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF count[63:0] > 31 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "imm8" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF imm8[7:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Shift packed 64-bit integers in "a" right by "count" while shifting in zeros, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF count[63:0] > 63 - dst[i+63:i] := 0 - ELSE - dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0]) - FI -ENDFOR - - - SSE2 -
emmintrin.h
- Shift -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] AND b[127:0]) - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise NOT of 128 bits (representing integer data) in "a" and then AND with "b", and store the result in "dst". - -dst[127:0] := ((NOT a[127:0]) AND b[127:0]) - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise OR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] OR b[127:0]) - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise XOR of 128 bits (representing integer data) in "a" and "b", and store the result in "dst". - -dst[127:0] := (a[127:0] XOR b[127:0]) - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] AND b[i+63:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in "a" and then AND with "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] OR b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in "a" and "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -ENDFOR - - - SSE2 -
emmintrin.h
- Logical -
- - - - - Compare packed 8-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed 16-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed 32-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 8-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched. - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 16-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched. - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed signed 32-bit integers in "a" and "b" for less-than, and store the results in "dst". Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched. - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for equality, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (!(a[63:0] < b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (!(a[63:0] <= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (!(a[63:0] > b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := (!(a[63:0] >= b[63:0])) ? 0xFFFFFFFFFFFFFFFF : 0 -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for less-than-or-equal, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for greater-than-or-equal, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if neither is NaN, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" to see if either is NaN, and store the results in "dst". - FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-equal, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (!(a[i+63:i] < b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-less-than-or-equal, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (!(a[i+63:i] <= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (!(a[i+63:i] > b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare packed double-precision (64-bit) floating-point elements in "a" and "b" for not-greater-than-or-equal, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := (!(a[i+63:i] >= b[i+63:i])) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). - RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] == b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] < b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] <= b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] > b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] != NaN AND b[63:0] != NaN AND a[63:0] >= b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - - Compare the lower double-precision (64-bit) floating-point element in "a" and "b" for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs. - RETURN ( a[63:0] == NaN OR b[63:0] == NaN OR a[63:0] != b[63:0] ) ? 1 : 0 - - - SSE2 -
emmintrin.h
- Compare -
- - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Convert the signed 32-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Convert the signed 64-bit integer "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_Int64_To_FP64(b[63:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed signed 32-bit integers in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := j*32 - m := j*64 - dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy 32-bit integer "a" to the lower elements of "dst", and zero the upper elements of "dst". - -dst[31:0] := a[31:0] -dst[127:32] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy 64-bit integer "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy the lower 32-bit integer in "a" to "dst". - -dst[31:0] := a[31:0] - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy the lower 64-bit integer in "a" to "dst". - -dst[63:0] := a[63:0] - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy the lower 64-bit integer in "a" to "dst". - -dst[63:0] := a[63:0] - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed single-precision (32-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k]) -ENDFOR -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed double-precision (64-bit) floating-point elements, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_Int32(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Convert the lower double-precision (64-bit) floating-point element in "b" to a single-precision (32-bit) floating-point element, store the result in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := Convert_FP64_To_FP32(b[63:0]) -dst[127:32] := a[127:32] -dst[MAX:128] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Copy the lower double-precision (64-bit) floating-point element of "a" to "dst". - -dst[63:0] := a[63:0] - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Convert the lower single-precision (32-bit) floating-point element in "b" to a double-precision (64-bit) floating-point element, store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := Convert_FP32_To_FP64(b[31:0]) -dst[127:64] := a[127:64] -dst[MAX:128] := 0 - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 32-bit integer with truncation, and store the result in "dst". - -dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert the lower double-precision (64-bit) floating-point element in "a" to a 64-bit integer with truncation, and store the result in "dst". - -dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed single-precision (32-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - Convert packed double-precision (64-bit) floating-point elements in "a" to packed 32-bit integers with truncation, and store the results in "dst". - -FOR j := 0 to 1 - i := 32*j - k := 64*j - dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k]) -ENDFOR - - - SSE2 -
emmintrin.h
- Convert -
- - - - - Set packed 64-bit integers in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 - - SSE2 -
emmintrin.h
- Set -
- - - - - Set packed 64-bit integers in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - Set packed 32-bit integers in "dst" with the supplied values. - -dst[31:0] := e0 -dst[63:32] := e1 -dst[95:64] := e2 -dst[127:96] := e3 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values. - -dst[15:0] := e0 -dst[31:16] := e1 -dst[47:32] := e2 -dst[63:48] := e3 -dst[79:64] := e4 -dst[95:80] := e5 -dst[111:96] := e6 -dst[127:112] := e7 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values. - -dst[7:0] := e0 -dst[15:8] := e1 -dst[23:16] := e2 -dst[31:24] := e3 -dst[39:32] := e4 -dst[47:40] := e5 -dst[55:48] := e6 -dst[63:56] := e7 -dst[71:64] := e8 -dst[79:72] := e9 -dst[87:80] := e10 -dst[95:88] := e11 -dst[103:96] := e12 -dst[111:104] := e13 -dst[119:112] := e14 -dst[127:120] := e15 - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast 64-bit integer "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast 64-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastq". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast 32-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastd". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := a[31:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast 16-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastw". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := a[15:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast 8-bit integer "a" to all elements of "dst". This intrinsic may generate "vpbroadcastb". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := a[7:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - - Set packed 64-bit integers in "dst" with the supplied values in reverse order. - -dst[63:0] := e1 -dst[127:64] := e0 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - Set packed 32-bit integers in "dst" with the supplied values in reverse order. - -dst[31:0] := e3 -dst[63:32] := e2 -dst[95:64] := e1 -dst[127:96] := e0 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - - - - - Set packed 16-bit integers in "dst" with the supplied values in reverse order. - -dst[15:0] := e7 -dst[31:16] := e6 -dst[47:32] := e5 -dst[63:48] := e4 -dst[79:64] := e3 -dst[95:80] := e2 -dst[111:96] := e1 -dst[127:112] := e0 - - SSE2 -
emmintrin.h
- Set -
- - - - - - - - - - - - - - - - - - - Set packed 8-bit integers in "dst" with the supplied values in reverse order. - -dst[7:0] := e15 -dst[15:8] := e14 -dst[23:16] := e13 -dst[31:24] := e12 -dst[39:32] := e11 -dst[47:40] := e10 -dst[55:48] := e9 -dst[63:56] := e8 -dst[71:64] := e7 -dst[79:72] := e6 -dst[87:80] := e5 -dst[95:88] := e4 -dst[103:96] := e3 -dst[111:104] := e2 -dst[119:112] := e1 -dst[127:120] := e0 - - SSE2 -
emmintrin.h
- Set -
- - - Return vector of type __m128i with all elements set to zero. - -dst[MAX:0] := 0 - - - SSE2 -
emmintrin.h
- Set -
- - - - Copy double-precision (64-bit) floating-point element "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - Broadcast double-precision (64-bit) floating-point value "a" to all elements of "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := a[63:0] -ENDFOR - - SSE2 -
emmintrin.h
- Set -
- - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values. - -dst[63:0] := e0 -dst[127:64] := e1 - - SSE2 -
emmintrin.h
- Set -
- - - - - Set packed double-precision (64-bit) floating-point elements in "dst" with the supplied values in reverse order. - -dst[63:0] := e1 -dst[127:64] := e0 - - SSE2 -
emmintrin.h
- Set -
- - - - Return vector of type __m128d with all elements set to zero. - -dst[MAX:0] := 0 - - - SSE2 -
emmintrin.h
- Set -
- - - - Copy the lower 64-bit integer in "a" to "dst". - -dst[63:0] := a[63:0] - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using signed saturation, and store the results in "dst". - -dst[7:0] := Saturate8(a[15:0]) -dst[15:8] := Saturate8(a[31:16]) -dst[23:16] := Saturate8(a[47:32]) -dst[31:24] := Saturate8(a[63:48]) -dst[39:32] := Saturate8(a[79:64]) -dst[47:40] := Saturate8(a[95:80]) -dst[55:48] := Saturate8(a[111:96]) -dst[63:56] := Saturate8(a[127:112]) -dst[71:64] := Saturate8(b[15:0]) -dst[79:72] := Saturate8(b[31:16]) -dst[87:80] := Saturate8(b[47:32]) -dst[95:88] := Saturate8(b[63:48]) -dst[103:96] := Saturate8(b[79:64]) -dst[111:104] := Saturate8(b[95:80]) -dst[119:112] := Saturate8(b[111:96]) -dst[127:120] := Saturate8(b[127:112]) - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using signed saturation, and store the results in "dst". - -dst[15:0] := Saturate16(a[31:0]) -dst[31:16] := Saturate16(a[63:32]) -dst[47:32] := Saturate16(a[95:64]) -dst[63:48] := Saturate16(a[127:96]) -dst[79:64] := Saturate16(b[31:0]) -dst[95:80] := Saturate16(b[63:32]) -dst[111:96] := Saturate16(b[95:64]) -dst[127:112] := Saturate16(b[127:96]) - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - - Convert packed signed 16-bit integers from "a" and "b" to packed 8-bit integers using unsigned saturation, and store the results in "dst". - -dst[7:0] := SaturateU8(a[15:0]) -dst[15:8] := SaturateU8(a[31:16]) -dst[23:16] := SaturateU8(a[47:32]) -dst[31:24] := SaturateU8(a[63:48]) -dst[39:32] := SaturateU8(a[79:64]) -dst[47:40] := SaturateU8(a[95:80]) -dst[55:48] := SaturateU8(a[111:96]) -dst[63:56] := SaturateU8(a[127:112]) -dst[71:64] := SaturateU8(b[15:0]) -dst[79:72] := SaturateU8(b[31:16]) -dst[87:80] := SaturateU8(b[47:32]) -dst[95:88] := SaturateU8(b[63:48]) -dst[103:96] := SaturateU8(b[79:64]) -dst[111:104] := SaturateU8(b[95:80]) -dst[119:112] := SaturateU8(b[111:96]) -dst[127:120] := SaturateU8(b[127:112]) - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - Create mask from the most significant bit of each 8-bit element in "a", and store the result in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[j] := a[i+7] -ENDFOR -dst[MAX:16] := 0 - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - Set each bit of mask "dst" based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in "a". - -FOR j := 0 to 1 - i := j*64 - IF a[i+63] - dst[j] := 1 - ELSE - dst[j] := 0 - FI -ENDFOR -dst[MAX:2] := 0 - - - SSE2 -
emmintrin.h
- Miscellaneous -
- - - - Copy the 64-bit integer "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Move -
- - - - Copy the lower 64-bit integer in "a" to the lower element of "dst", and zero the upper element. - -dst[63:0] := a[63:0] -dst[127:64] := 0 - - - SSE2 -
emmintrin.h
- Move -
- - - - - Move the lower double-precision (64-bit) floating-point element from "b" to the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := b[63:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Move -
- - - - - Extract a 16-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - -dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0] -dst[31:16] := 0 - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 16-bit integer "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[2:0]*16 -dst[sel+15:sel] := i[15:0] - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Shuffle 32-bit integers in "a" using the control in "imm8", and store the results in "dst". - -DEFINE SELECT4(src, control) { - CASE(control[1:0]) OF - 0: tmp[31:0] := src[31:0] - 1: tmp[31:0] := src[63:32] - 2: tmp[31:0] := src[95:64] - 3: tmp[31:0] := src[127:96] - ESAC - RETURN tmp[31:0] -} -dst[31:0] := SELECT4(a[127:0], imm8[1:0]) -dst[63:32] := SELECT4(a[127:0], imm8[3:2]) -dst[95:64] := SELECT4(a[127:0], imm8[5:4]) -dst[127:96] := SELECT4(a[127:0], imm8[7:6]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in the high 64 bits of "a" using the control in "imm8". Store the results in the high 64 bits of "dst", with the low 64 bits being copied from "a" to "dst". - -dst[63:0] := a[63:0] -dst[79:64] := (a >> (imm8[1:0] * 16))[79:64] -dst[95:80] := (a >> (imm8[3:2] * 16))[79:64] -dst[111:96] := (a >> (imm8[5:4] * 16))[79:64] -dst[127:112] := (a >> (imm8[7:6] * 16))[79:64] - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Shuffle 16-bit integers in the low 64 bits of "a" using the control in "imm8". Store the results in the low 64 bits of "dst", with the high 64 bits being copied from "a" to "dst". - -dst[15:0] := (a >> (imm8[1:0] * 16))[15:0] -dst[31:16] := (a >> (imm8[3:2] * 16))[15:0] -dst[47:32] := (a >> (imm8[5:4] * 16))[15:0] -dst[63:48] := (a >> (imm8[7:6] * 16))[15:0] -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[71:64] - dst[15:8] := src2[71:64] - dst[23:16] := src1[79:72] - dst[31:24] := src2[79:72] - dst[39:32] := src1[87:80] - dst[47:40] := src2[87:80] - dst[55:48] := src1[95:88] - dst[63:56] := src2[95:88] - dst[71:64] := src1[103:96] - dst[79:72] := src2[103:96] - dst[87:80] := src1[111:104] - dst[95:88] := src2[111:104] - dst[103:96] := src1[119:112] - dst[111:104] := src2[119:112] - dst[119:112] := src1[127:120] - dst[127:120] := src2[127:120] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[79:64] - dst[31:16] := src2[79:64] - dst[47:32] := src1[95:80] - dst[63:48] := src2[95:80] - dst[79:64] := src1[111:96] - dst[95:80] := src2[111:96] - dst[111:96] := src1[127:112] - dst[127:112] := src2[127:112] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[95:64] - dst[63:32] := src2[95:64] - dst[95:64] := src1[127:96] - dst[127:96] := src2[127:96] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 8-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_BYTES(src1[127:0], src2[127:0]) { - dst[7:0] := src1[7:0] - dst[15:8] := src2[7:0] - dst[23:16] := src1[15:8] - dst[31:24] := src2[15:8] - dst[39:32] := src1[23:16] - dst[47:40] := src2[23:16] - dst[55:48] := src1[31:24] - dst[63:56] := src2[31:24] - dst[71:64] := src1[39:32] - dst[79:72] := src2[39:32] - dst[87:80] := src1[47:40] - dst[95:88] := src2[47:40] - dst[103:96] := src1[55:48] - dst[111:104] := src2[55:48] - dst[119:112] := src1[63:56] - dst[127:120] := src2[63:56] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 16-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_WORDS(src1[127:0], src2[127:0]) { - dst[15:0] := src1[15:0] - dst[31:16] := src2[15:0] - dst[47:32] := src1[31:16] - dst[63:48] := src2[31:16] - dst[79:64] := src1[47:32] - dst[95:80] := src2[47:32] - dst[111:96] := src1[63:48] - dst[127:112] := src2[63:48] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 32-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_DWORDS(src1[127:0], src2[127:0]) { - dst[31:0] := src1[31:0] - dst[63:32] := src2[31:0] - dst[95:64] := src1[63:32] - dst[127:96] := src2[63:32] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave 64-bit integers from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the high half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[127:64] - dst[127:64] := src2[127:64] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Unpack and interleave double-precision (64-bit) floating-point elements from the low half of "a" and "b", and store the results in "dst". - -DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { - dst[63:0] := src1[63:0] - dst[127:64] := src2[63:0] - RETURN dst[127:0] -} -dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - - Shuffle double-precision (64-bit) floating-point elements using the control in "imm8", and store the results in "dst". - -dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] -dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] - - - SSE2 -
emmintrin.h
- Swizzle -
- - - - - Compute the square root of the lower double-precision (64-bit) floating-point element in "b", store the result in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := SQRT(b[63:0]) -dst[127:64] := a[127:64] - - - SSE2 -
emmintrin.h
- Elementary Math Functions -
- - - - Compute the square root of packed double-precision (64-bit) floating-point elements in "a", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SQRT(a[i+63:i]) -ENDFOR - - - SSE2 -
emmintrin.h
- Elementary Math Functions -
- - - - Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. - SSE2 -
emmintrin.h
- Cast -
- - - - - - - Alternatively add and subtract packed single-precision (32-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF ((j & 1) == 0) - dst[i+31:i] := a[i+31:i] - b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] + b[i+31:i] - FI -ENDFOR - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - - Alternatively add and subtract packed double-precision (64-bit) floating-point elements in "a" to/from packed elements in "b", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF ((j & 1) == 0) - dst[i+63:i] := a[i+63:i] - b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] + b[i+63:i] - FI -ENDFOR - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[63:0] := a[127:64] + a[63:0] -dst[127:64] := b[127:64] + b[63:0] - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := a[127:96] + a[95:64] -dst[95:64] := b[63:32] + b[31:0] -dst[127:96] := b[127:96] + b[95:64] - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[63:0] := a[63:0] - a[127:64] -dst[127:64] := b[63:0] - b[127:64] - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in "a" and "b", and pack the results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := a[95:64] - a[127:96] -dst[95:64] := b[31:0] - b[63:32] -dst[127:96] := b[95:64] - b[127:96] - - - SSE3 -
pmmintrin.h
- Arithmetic -
- - - - Load 128-bits of integer data from unaligned memory into "dst". This intrinsic may perform better than "_mm_loadu_si128" when the data crosses a cache line boundary. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE3 -
pmmintrin.h
- Load -
- - - - Load a double-precision (64-bit) floating-point element from memory into both elements of "dst". - -dst[63:0] := MEM[mem_addr+63:mem_addr] -dst[127:64] := MEM[mem_addr+63:mem_addr] - - - SSE3 -
pmmintrin.h
- Load -
- - - - Duplicate the low double-precision (64-bit) floating-point element from "a", and store the results in "dst". - -dst[63:0] := a[63:0] -dst[127:64] := a[63:0] - - - SSE3 -
pmmintrin.h
- Move -
- - - - Duplicate odd-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[63:32] -dst[63:32] := a[63:32] -dst[95:64] := a[127:96] -dst[127:96] := a[127:96] - - - SSE3 -
pmmintrin.h
- Move -
- - - - Duplicate even-indexed single-precision (32-bit) floating-point elements from "a", and store the results in "dst". - -dst[31:0] := a[31:0] -dst[63:32] := a[31:0] -dst[95:64] := a[95:64] -dst[127:96] := a[95:64] - - - SSE3 -
pmmintrin.h
- Move -
- - - - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF imm8[j] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF imm8[j] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Blend packed double-precision (64-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - IF mask[i+63] - dst[i+63:i] := b[i+63:i] - ELSE - dst[i+63:i] := a[i+63:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Blend packed single-precision (32-bit) floating-point elements from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 3 - i := j*32 - IF mask[i+31] - dst[i+31:i] := b[i+31:i] - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Blend packed 8-bit integers from "a" and "b" using "mask", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - IF mask[i+7] - dst[i+7:i] := b[i+7:i] - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Blend packed 16-bit integers from "a" and "b" using control mask "imm8", and store the results in "dst". - -FOR j := 0 to 7 - i := j*16 - IF imm8[j] - dst[i+15:i] := b[i+15:i] - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - Extract a single-precision (32-bit) floating-point element from "a", selected with "imm8", and store the result in "dst". - -dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - Extract an 8-bit integer from "a", selected with "imm8", and store the result in the lower element of "dst". - -dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0] -dst[31:8] := 0 - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - Extract a 32-bit integer from "a", selected with "imm8", and store the result in "dst". - -dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - Extract a 64-bit integer from "a", selected with "imm8", and store the result in "dst". - -dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Copy "a" to "tmp", then insert a single-precision (32-bit) floating-point element from "b" into "tmp" using the control in "imm8". Store "tmp" to "dst" using the mask in "imm8" (elements are zeroed out when the corresponding bit is set). - -tmp2[127:0] := a[127:0] -CASE (imm8[7:6]) OF -0: tmp1[31:0] := b[31:0] -1: tmp1[31:0] := b[63:32] -2: tmp1[31:0] := b[95:64] -3: tmp1[31:0] := b[127:96] -ESAC -CASE (imm8[5:4]) OF -0: tmp2[31:0] := tmp1[31:0] -1: tmp2[63:32] := tmp1[31:0] -2: tmp2[95:64] := tmp1[31:0] -3: tmp2[127:96] := tmp1[31:0] -ESAC -FOR j := 0 to 3 - i := j*32 - IF imm8[j%8] - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := tmp2[i+31:i] - FI -ENDFOR - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the lower 8-bit integer from "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[3:0]*8 -dst[sel+7:sel] := i[7:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 32-bit integer "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[1:0]*32 -dst[sel+31:sel] := i[31:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Copy "a" to "dst", and insert the 64-bit integer "i" into "dst" at the location specified by "imm8". - -dst[127:0] := a[127:0] -sel := imm8[0]*64 -dst[sel+63:sel] := i[63:0] - - - SSE4.1 -
smmintrin.h
- Swizzle -
- - - - - - Conditionally multiply the packed double-precision (64-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the two products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - -DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { - FOR j := 0 to 1 - i := j*64 - IF imm8[(4+j)%8] - temp[i+63:i] := a[i+63:i] * b[i+63:i] - ELSE - temp[i+63:i] := 0.0 - FI - ENDFOR - - sum[63:0] := temp[127:64] + temp[63:0] - - FOR j := 0 to 1 - i := j*64 - IF imm8[j%8] - tmpdst[i+63:i] := sum[63:0] - ELSE - tmpdst[i+63:i] := 0.0 - FI - ENDFOR - RETURN tmpdst[127:0] -} -dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) - - - SSE4.1 -
smmintrin.h
- Arithmetic -
- - - - - - Conditionally multiply the packed single-precision (32-bit) floating-point elements in "a" and "b" using the high 4 bits in "imm8", sum the four products, and conditionally store the sum in "dst" using the low 4 bits of "imm8". - -DEFINE DP(a[127:0], b[127:0], imm8[7:0]) { - FOR j := 0 to 3 - i := j*32 - IF imm8[(4+j)%8] - temp[i+31:i] := a[i+31:i] * b[i+31:i] - ELSE - temp[i+31:i] := 0 - FI - ENDFOR - - sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0]) - - FOR j := 0 to 3 - i := j*32 - IF imm8[j%8] - tmpdst[i+31:i] := sum[31:0] - ELSE - tmpdst[i+31:i] := 0 - FI - ENDFOR - RETURN tmpdst[127:0] -} -dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0]) - - - SSE4.1 -
smmintrin.h
- Arithmetic -
- - - - - Multiply the low signed 32-bit integers from each packed 64-bit element in "a" and "b", and store the signed 64-bit results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := SignExtend64(a[i+31:i]) * SignExtend64(b[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Arithmetic -
- - - - - Multiply the packed 32-bit integers in "a" and "b", producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in "dst". - -FOR j := 0 to 3 - i := j*32 - tmp[63:0] := a[i+31:i] * b[i+31:i] - dst[i+31:i] := tmp[31:0] -ENDFOR - - - SSE4.1 -
smmintrin.h
- Arithmetic -
- - Miscellaneous - - - - - Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in "a" compared to those in "b", and store the 16-bit results in "dst". - Eight SADs are performed using one quadruplet from "b" and eight quadruplets from "a". One quadruplet is selected from "b" starting at the offset specified in "imm8". Eight quadruplets are formed from sequential 8-bit integers selected from "a" starting at the offset specified in "imm8". - -DEFINE MPSADBW(a[127:0], b[127:0], imm8[2:0]) { - a_offset := imm8[2]*32 - b_offset := imm8[1:0]*32 - FOR j := 0 to 7 - i := j*8 - k := a_offset+i - l := b_offset - tmp[i*2+15:i*2] := ABS(Signed(a[k+7:k] - b[l+7:l])) + ABS(Signed(a[k+15:k+8] - b[l+15:l+8])) + \ - ABS(Signed(a[k+23:k+16] - b[l+23:l+16])) + ABS(Signed(a[k+31:k+24] - b[l+31:l+24])) - ENDFOR - RETURN tmp[127:0] -} -dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0]) - - - SSE4.1 -
smmintrin.h
- Arithmetic -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MAX(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed maximum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 8-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed signed 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 32-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := MIN(a[i+31:i], b[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Compare packed unsigned 16-bit integers in "a" and "b", and store packed minimum values in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the packed double-precision (64-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed double-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ROUND(a[i+63:i], rounding) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" down to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := FLOOR(a[i+63:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - Round the packed double-precision (64-bit) floating-point elements in "a" up to an integer value, and store the results as packed double-precision floating-point elements in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := CEIL(a[i+63:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the packed single-precision (32-bit) floating-point elements in "a" using the "rounding" parameter, and store the results as packed single-precision floating-point elements in "dst". - [round_note] - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ROUND(a[i+31:i], rounding) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" down to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := FLOOR(a[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - Round the packed single-precision (32-bit) floating-point elements in "a" up to an integer value, and store the results as packed single-precision floating-point elements in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := CEIL(a[i+31:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - - Round the lower double-precision (64-bit) floating-point element in "b" using the "rounding" parameter, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - [round_note] - -dst[63:0] := ROUND(b[63:0], rounding) -dst[127:64] := a[127:64] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the lower double-precision (64-bit) floating-point element in "b" down to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := FLOOR(b[63:0]) -dst[127:64] := a[127:64] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the lower double-precision (64-bit) floating-point element in "b" up to an integer value, store the result as a double-precision floating-point element in the lower element of "dst", and copy the upper element from "a" to the upper element of "dst". - -dst[63:0] := CEIL(b[63:0]) -dst[127:64] := a[127:64] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - - Round the lower single-precision (32-bit) floating-point element in "b" using the "rounding" parameter, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - [round_note] - -dst[31:0] := ROUND(b[31:0], rounding) -dst[127:32] := a[127:32] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the lower single-precision (32-bit) floating-point element in "b" down to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := FLOOR(b[31:0]) -dst[127:32] := a[127:32] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - - - - Round the lower single-precision (32-bit) floating-point element in "b" up to an integer value, store the result as a single-precision floating-point element in the lower element of "dst", and copy the upper 3 packed elements from "a" to the upper elements of "dst". - -dst[31:0] := CEIL(b[31:0]) -dst[127:32] := a[127:32] - - - SSE4.1 -
smmintrin.h
- Special Math Functions -
- - Miscellaneous - - - - Convert packed signed 32-bit integers from "a" and "b" to packed 16-bit integers using unsigned saturation, and store the results in "dst". - -dst[15:0] := SaturateU16(a[31:0]) -dst[31:16] := SaturateU16(a[63:32]) -dst[47:32] := SaturateU16(a[95:64]) -dst[63:48] := SaturateU16(a[127:96]) -dst[79:64] := SaturateU16(b[31:0]) -dst[95:80] := SaturateU16(b[63:32]) -dst[111:96] := SaturateU16(b[95:64]) -dst[127:112] := SaturateU16(b[127:96]) - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - l := j*16 - dst[l+15:l] := SignExtend16(a[i+7:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[i+31:i] := SignExtend32(a[k+7:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[i+63:i] := SignExtend64(a[k+7:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[i+31:i] := SignExtend32(a[k+15:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[i+63:i] := SignExtend64(a[k+15:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Sign extend packed 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[i+63:i] := SignExtend64(a[k+31:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 16-bit integers, and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - l := j*16 - dst[l+15:l] := ZeroExtend16(a[i+7:i]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 8*j - dst[i+31:i] := ZeroExtend32(a[k+7:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 8-bit integers in the low 8 bytes of "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 8*j - dst[i+63:i] := ZeroExtend64(a[k+7:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 32-bit integers, and store the results in "dst". - -FOR j := 0 to 3 - i := 32*j - k := 16*j - dst[i+31:i] := ZeroExtend32(a[k+15:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 16-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 16*j - dst[i+63:i] := ZeroExtend64(a[k+15:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - Zero extend packed unsigned 32-bit integers in "a" to packed 64-bit integers, and store the results in "dst". - -FOR j := 0 to 1 - i := 64*j - k := 32*j - dst[i+63:i] := ZeroExtend64(a[k+31:k]) -ENDFOR - - - SSE4.1 -
smmintrin.h
- Convert -
- - - - - Compare packed 64-bit integers in "a" and "b" for equality, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE4.1 -
smmintrin.h
- Compare -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "ZF" value. - -IF ((a[127:0] AND b[127:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[127:0]) AND b[127:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -RETURN ZF - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return the "CF" value. - -IF ((a[127:0] AND b[127:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[127:0]) AND b[127:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -RETURN CF - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "b", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "b", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -IF ((a[127:0] AND b[127:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[127:0]) AND b[127:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and return 1 if the result is zero, otherwise return 0. - -IF ((a[127:0] AND mask[127:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -dst := ZF - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - - Compute the bitwise AND of 128 bits (representing integer data) in "a" and "mask", and set "ZF" to 1 if the result is zero, otherwise set "ZF" to 0. Compute the bitwise NOT of "a" and then AND with "mask", and set "CF" to 1 if the result is zero, otherwise set "CF" to 0. Return 1 if both the "ZF" and "CF" values are zero, otherwise return 0. - -IF ((a[127:0] AND mask[127:0]) == 0) - ZF := 1 -ELSE - ZF := 0 -FI -IF (((NOT a[127:0]) AND mask[127:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -IF (ZF == 0 && CF == 0) - dst := 1 -ELSE - dst := 0 -FI - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - Compute the bitwise NOT of "a" and then AND with a 128-bit vector containing all 1's, and return 1 if the result is zero, otherwise return 0. - -FOR j := 0 to 127 - tmp[j] := 1 -ENDFOR -IF (((NOT a[127:0]) AND tmp[127:0]) == 0) - CF := 1 -ELSE - CF := 0 -FI -dst := CF - - - - SSE4.1 -
smmintrin.h
- Logical -
- - - - Horizontally compute the minimum amongst the packed unsigned 16-bit integers in "a", store the minimum and index in "dst", and zero the remaining bits in "dst". - -index[2:0] := 0 -min[15:0] := a[15:0] -FOR j := 0 to 7 - i := j*16 - IF a[i+15:i] < min[15:0] - index[2:0] := j - min[15:0] := a[i+15:i] - FI -ENDFOR -dst[15:0] := min[15:0] -dst[18:16] := index[2:0] -dst[127:19] := 0 - - - SSE4.1 -
smmintrin.h
- Miscellaneous -
- - - - Load 128-bits of integer data from memory into "dst" using a non-temporal memory hint. - "mem_addr" must be aligned on a 16-byte boundary or a general-protection exception may be generated. - -dst[127:0] := MEM[mem_addr+127:mem_addr] - - - SSE4.1 -
smmintrin.h
- Load -
- - - - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated mask in "dst". - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] 
:= BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -bInvalid := 0 -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - IF bInvalid // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -IF imm8[6] // byte / word mask - FOR i := 0 to UpperBound - j := i*size - IF IntRes2[i] - dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) - ELSE - dst[j+size-1:j] := 0 - FI - ENDFOR -ELSE // bit mask - dst[UpperBound:0] := IntRes2[UpperBound:0] - dst[127:UpperBound+1] := 0 -FI - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and store the generated index in "dst". - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := 
BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -bInvalid := 0 -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - IF bInvalid // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -IF imm8[6] // most significant bit - tmp := UpperBound - dst := tmp - DO WHILE ((tmp >= 0) AND a[tmp] == 0) - tmp := tmp - 1 - dst := tmp - OD -ELSE // least significant bit - tmp := 0 - dst := tmp - DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) - tmp := tmp + 1 - dst := tmp - OD -FI - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -bInvalid := 0 -FOR j := 0 to UpperBound - n := j*size - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI -ENDFOR -dst := bInvalid - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to 
UpperBound - IntRes1[i] := BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -bInvalid := 0 -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - IF bInvalid // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := (IntRes2 != 0) - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -aInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI -ENDFOR -dst := aInvalid - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns bit 0 of the resulting bit mask. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - 
IntRes1[i] := BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -bInvalid := 0 -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - IF bInvalid // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := IntRes2[0] - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - Compare packed strings with implicit lengths in "a" and "b" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF a[m+size-1:m] == 0 - aInvalid := 1 - FI - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // 
equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -bInvalid := 0 -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF b[n+size-1:n] == 0 - bInvalid := 1 - FI - IF bInvalid // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := (IntRes2 == 0) AND bInvalid - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated mask in "dst". - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF i == la - aInvalid := 1 - FI - IF j == lb - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := 
BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF i >= lb // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -IF imm8[6] // byte / word mask - FOR i := 0 to UpperBound - j := i*size - IF IntRes2[i] - dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF) - ELSE - dst[j+size-1:j] := 0 - FI - ENDFOR -ELSE // bit mask - dst[UpperBound:0] := IntRes2[UpperBound:0] - dst[127:UpperBound+1] := 0 -FI - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and store the generated index in "dst". - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF i == la - aInvalid := 1 - FI - IF j == lb - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := 
BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF i >= lb // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -IF imm8[6] // most significant bit - tmp := UpperBound - dst := tmp - DO WHILE ((tmp >= 0) AND a[tmp] == 0) - tmp := tmp - 1 - dst := tmp - OD -ELSE // least significant bit - tmp := 0 - dst := tmp - DO WHILE ((tmp <= UpperBound) AND a[tmp] == 0) - tmp := tmp + 1 - dst := tmp - OD -FI - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "b" was null, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -dst := (lb <= UpperBound) - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if the resulting mask was non-zero, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF i == la - aInvalid := 1 - FI - IF j == lb - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - 
IntRes1[i] := BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF i >= lb // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := (IntRes2 != 0) - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if any character in "a" was null, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -dst := (la <= UpperBound) - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns bit 0 of the resulting bit mask. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF i == la - aInvalid := 1 - FI - IF j == lb - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := 
BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF i >= lb // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := IntRes2[0] - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - - - - Compare packed strings in "a" and "b" with lengths "la" and "lb" using the control in "imm8", and returns 1 if "b" did not contain a null character and the resulting mask was zero, and 0 otherwise. - [strcmp_note] - -size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters -UpperBound := (128 / size) - 1 -BoolRes := 0 -// compare all characters -aInvalid := 0 -bInvalid := 0 -FOR i := 0 to UpperBound - m := i*size - FOR j := 0 to UpperBound - n := j*size - BoolRes.word[i].bit[j] := (a[m+size-1:m] == b[n+size-1:n]) ? 1 : 0 - - // invalidate characters after EOS - IF i == la - aInvalid := 1 - FI - IF j == lb - bInvalid := 1 - FI - - // override comparisons for invalid characters - CASE (imm8[3:2]) OF - 0: // equal any - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 1: // ranges - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - FI - 2: // equal each - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - 3: // equal ordered - IF (!aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 0 - ELSE IF (aInvalid && !bInvalid) - BoolRes.word[i].bit[j] := 1 - ELSE IF (aInvalid && bInvalid) - BoolRes.word[i].bit[j] := 1 - FI - ESAC - ENDFOR -ENDFOR -// aggregate results -CASE (imm8[3:2]) OF -0: // equal any - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR BoolRes.word[i].bit[j] - ENDFOR - ENDFOR -1: // ranges - IntRes1 := 0 - FOR i := 0 to UpperBound - FOR j := 0 to UpperBound - IntRes1[i] := IntRes1[i] OR (BoolRes.word[i].bit[j] AND BoolRes.word[i].bit[j+1]) - j += 2 - ENDFOR - ENDFOR -2: // equal each - 
IntRes1 := 0 - FOR i := 0 to UpperBound - IntRes1[i] := BoolRes.word[i].bit[i] - ENDFOR -3: // equal ordered - IntRes1 := (imm8[0] ? 0xFF : 0xFFFF) - FOR i := 0 to UpperBound - k := i - FOR j := 0 to UpperBound-i - IntRes1[i] := IntRes1[i] AND BoolRes.word[k].bit[j] - k := k+1 - ENDFOR - ENDFOR -ESAC -// optionally negate results -FOR i := 0 to UpperBound - IF imm8[4] - IF imm8[5] // only negate valid - IF i >= lb // invalid, don't negate - IntRes2[i] := IntRes1[i] - ELSE // valid, negate - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // negate all - IntRes2[i] := -1 XOR IntRes1[i] - FI - ELSE // don't negate - IntRes2[i] := IntRes1[i] - FI -ENDFOR -// output -dst := (IntRes2 == 0) AND (lb > UpperBound) - - - SSE4.2 -
nmmintrin.h
- String Compare -
- - - - - Compare packed signed 64-bit integers in "a" and "b" for greater-than, and store the results in "dst". - -FOR j := 0 to 1 - i := j*64 - dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0 -ENDFOR - - - SSE4.2 -
nmmintrin.h
- Compare -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 8-bit integer "v", and stores the result in "dst". - tmp1[7:0] := v[0:7] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[39:0] := tmp1[7:0] << 32 -tmp4[39:0] := tmp2[31:0] << 8 -tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0] -tmp6[31:0] := MOD2(tmp5[39:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - SSE4.2 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 16-bit integer "v", and stores the result in "dst". - tmp1[15:0] := v[0:15] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[47:0] := tmp1[15:0] << 32 -tmp4[47:0] := tmp2[31:0] << 16 -tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0] -tmp6[31:0] := MOD2(tmp5[47:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - SSE4.2 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 32-bit integer "v", and stores the result in "dst". - tmp1[31:0] := v[0:31] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[63:0] := tmp1[31:0] << 32 -tmp4[63:0] := tmp2[31:0] << 32 -tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0] -tmp6[31:0] := MOD2(tmp5[63:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - SSE4.2 -
nmmintrin.h
- Cryptography -
- - - - - Starting with the initial value in "crc", accumulates a CRC32 value for unsigned 64-bit integer "v", and stores the result in "dst". - tmp1[63:0] := v[0:63] // bit reflection -tmp2[31:0] := crc[0:31] // bit reflection -tmp3[95:0] := tmp1[63:0] << 32 -tmp4[95:0] := tmp2[31:0] << 64 -tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0] -tmp6[31:0] := MOD2(tmp5[95:0], 0x11EDC6F41) // remainder from polynomial division modulus 2 -dst[31:0] := tmp6[0:31] // bit reflection - - - SSE4.2 -
nmmintrin.h
- Cryptography -
- - - - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 7 - i := j*8 - dst[i+7:i] := ABS(Int(a[i+7:i])) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 8-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 15 - i := j*8 - dst[i+7:i] := ABS(a[i+7:i]) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := ABS(Int(a[i+15:i])) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 16-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := ABS(a[i+15:i]) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 1 - i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - Compute the absolute value of packed signed 32-bit integers in "a", and store the unsigned results in "dst". - -FOR j := 0 to 3 - i := j*32 - dst[i+31:i] := ABS(a[i+31:i]) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Special Math Functions -
- - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - -FOR j := 0 to 15 - i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[3:0] := b[i+3:i] - dst[i+7:i] := a[index*8+7:index*8] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Swizzle -
- - - - - Shuffle packed 8-bit integers in "a" according to shuffle control mask in the corresponding 8-bit element of "b", and store the results in "dst". - -FOR j := 0 to 7 - i := j*8 - IF b[i+7] == 1 - dst[i+7:i] := 0 - ELSE - index[2:0] := b[i+2:i] - dst[i+7:i] := a[index*8+7:index*8] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Swizzle -
- - - - - - Concatenate 16-byte blocks in "a" and "b" into a 32-byte temporary result, shift the result right by "imm8" bytes, and store the low 16 bytes in "dst". - -tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) -dst[127:0] := tmp[127:0] - - - SSSE3 -
tmmintrin.h
- Miscellaneous -
- - - - - - Concatenate 8-byte blocks in "a" and "b" into a 16-byte temporary result, shift the result right by "imm8" bytes, and store the low 8 bytes in "dst". - -tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) -dst[63:0] := tmp[63:0] - - - SSSE3 -
tmmintrin.h
- Miscellaneous -
- - - - - Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[31:16] + a[15:0] -dst[31:16] := a[63:48] + a[47:32] -dst[47:32] := a[95:80] + a[79:64] -dst[63:48] := a[127:112] + a[111:96] -dst[79:64] := b[31:16] + b[15:0] -dst[95:80] := b[63:48] + b[47:32] -dst[111:96] := b[95:80] + b[79:64] -dst[127:112] := b[127:112] + b[111:96] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[31:16] + a[15:0]) -dst[31:16] := Saturate16(a[63:48] + a[47:32]) -dst[47:32] := Saturate16(a[95:80] + a[79:64]) -dst[63:48] := Saturate16(a[127:112] + a[111:96]) -dst[79:64] := Saturate16(b[31:16] + b[15:0]) -dst[95:80] := Saturate16(b[63:48] + b[47:32]) -dst[111:96] := Saturate16(b[95:80] + b[79:64]) -dst[127:112] := Saturate16(b[127:112] + b[111:96]) - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := a[127:96] + a[95:64] -dst[95:64] := b[63:32] + b[31:0] -dst[127:96] := b[127:96] + b[95:64] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[31:16] + a[15:0] -dst[31:16] := a[63:48] + a[47:32] -dst[47:32] := b[31:16] + b[15:0] -dst[63:48] := b[63:48] + b[47:32] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[63:32] + a[31:0] -dst[63:32] := b[63:32] + b[31:0] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally add adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[31:16] + a[15:0]) -dst[31:16] := Saturate16(a[63:48] + a[47:32]) -dst[47:32] := Saturate16(b[31:16] + b[15:0]) -dst[63:48] := Saturate16(b[63:48] + b[47:32]) - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[15:0] - a[31:16] -dst[31:16] := a[47:32] - a[63:48] -dst[47:32] := a[79:64] - a[95:80] -dst[63:48] := a[111:96] - a[127:112] -dst[79:64] := b[15:0] - b[31:16] -dst[95:80] := b[47:32] - b[63:48] -dst[111:96] := b[79:64] - b[95:80] -dst[127:112] := b[111:96] - b[127:112] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[15:0] - a[31:16]) -dst[31:16] := Saturate16(a[47:32] - a[63:48]) -dst[47:32] := Saturate16(a[79:64] - a[95:80]) -dst[63:48] := Saturate16(a[111:96] - a[127:112]) -dst[79:64] := Saturate16(b[15:0] - b[31:16]) -dst[95:80] := Saturate16(b[47:32] - b[63:48]) -dst[111:96] := Saturate16(b[79:64] - b[95:80]) -dst[127:112] := Saturate16(b[111:96] - b[127:112]) - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := a[95:64] - a[127:96] -dst[95:64] := b[31:0] - b[63:32] -dst[127:96] := b[95:64] - b[127:96] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 16-bit integers in "a" and "b", and pack the signed 16-bit results in "dst". - -dst[15:0] := a[15:0] - a[31:16] -dst[31:16] := a[47:32] - a[63:48] -dst[47:32] := b[15:0] - b[31:16] -dst[63:48] := b[47:32] - b[63:48] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of 32-bit integers in "a" and "b", and pack the signed 32-bit results in "dst". - -dst[31:0] := a[31:0] - a[63:32] -dst[63:32] := b[31:0] - b[63:32] - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Horizontally subtract adjacent pairs of signed 16-bit integers in "a" and "b" using saturation, and pack the signed 16-bit results in "dst". - -dst[15:0] := Saturate16(a[15:0] - a[31:16]) -dst[31:16] := Saturate16(a[47:32] - a[63:48]) -dst[47:32] := Saturate16(b[15:0] - b[31:16]) -dst[63:48] := Saturate16(b[47:32] - b[63:48]) - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - -FOR j := 0 to 7 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Vertically multiply each unsigned 8-bit integer from "a" with the corresponding signed 8-bit integer from "b", producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in "dst". - -FOR j := 0 to 3 - i := j*16 - dst[i+15:i] := Saturate16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] ) -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - -FOR j := 0 to 7 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Multiply packed signed 16-bit integers in "a" and "b", producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to "dst". - -FOR j := 0 to 3 - i := j*16 - tmp[31:0] := ((SignExtend32(a[i+15:i]) * SignExtend32(b[i+15:i])) >> 14) + 1 - dst[i+15:i] := tmp[16:1] -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 15 - i := j*8 - IF b[i+7:i] < 0 - dst[i+7:i] := -(a[i+7:i]) - ELSE IF b[i+7:i] == 0 - dst[i+7:i] := 0 - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 7 - i := j*16 - IF b[i+15:i] < 0 - dst[i+15:i] := -(a[i+15:i]) - ELSE IF b[i+15:i] == 0 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 3 - i := j*32 - IF b[i+31:i] < 0 - dst[i+31:i] := -(a[i+31:i]) - ELSE IF b[i+31:i] == 0 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 8-bit integers in "a" when the corresponding signed 8-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 7 - i := j*8 - IF b[i+7:i] < 0 - dst[i+7:i] := -(a[i+7:i]) - ELSE IF b[i+7:i] == 0 - dst[i+7:i] := 0 - ELSE - dst[i+7:i] := a[i+7:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 16-bit integers in "a" when the corresponding signed 16-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 3 - i := j*16 - IF b[i+15:i] < 0 - dst[i+15:i] := -(a[i+15:i]) - ELSE IF b[i+15:i] == 0 - dst[i+15:i] := 0 - ELSE - dst[i+15:i] := a[i+15:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - Negate packed 32-bit integers in "a" when the corresponding signed 32-bit integer in "b" is negative, and store the results in "dst". Elements in "dst" are zeroed out when the corresponding element in "b" is zero. - -FOR j := 0 to 1 - i := j*32 - IF b[i+31:i] < 0 - dst[i+31:i] := -(a[i+31:i]) - ELSE IF b[i+31:i] == 0 - dst[i+31:i] := 0 - ELSE - dst[i+31:i] := a[i+31:i] - FI -ENDFOR - - - SSSE3 -
tmmintrin.h
- Arithmetic -
- - - - - - Copy the current 64-bit value of the processor's time-stamp counter into "dst". - dst[63:0] := TimeStampCounter - - - TSC -
immintrin.h
- General Support -
- - - - - Mark the start of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a transactional region, subsequent loads are not added to the read set of the transaction. If this is used inside a suspend load address tracking region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. - - TSXLDTRK -
immintrin.h
- Miscellaneous -
- - - Mark the end of a TSX (HLE/RTM) suspend load address tracking region. If this is used inside a suspend load address tracking region it will end the suspend region and all following load addresses will be added to the transaction read set. If this is used inside an active transaction but not in a suspend region it will cause transaction abort. If this is used outside of a transactional region it behaves like a NOP. - - TSXLDTRK -
immintrin.h
- Miscellaneous -
- - - - - - Clear the user interrupt flag (UIF). - - UINTR -
immintrin.h
- General Support -
- - - - Send user interprocessor interrupts specified in unsigned 64-bit integer "__a". - - UINTR -
immintrin.h
- General Support -
- - - - Sets the user interrupt flag (UIF). - - UINTR -
immintrin.h
- General Support -
- - - - Store the current user interrupt flag (UIF) in unsigned 8-bit integer "dst". - - UINTR -
immintrin.h
- General Support -
- - - - - Reads the contents of a 64-bit MSR specified in "__A" into "dst". - DEST := MSR[__A] - - - USER_MSR -
x86gprintrin.h
- General Support -
- - - - - Writes the contents of "__B" into the 64-bit MSR specified in "__A". - MSR[__A] := __B - - - USER_MSR -
x86gprintrin.h
- General Support -
- - - - - Perform the last round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 1 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:256] := 0 - - - VAES - AVX512VL -
immintrin.h
- Cryptography -
- - - - - Perform one round of an AES encryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 1 - i := j*128 - a[i+127:i] := ShiftRows(a[i+127:i]) - a[i+127:i] := SubBytes(a[i+127:i]) - a[i+127:i] := MixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:256] := 0 - - - VAES - AVX512VL -
immintrin.h
- Cryptography -
- - - - - Perform the last round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 1 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:256] := 0 - - - VAES - AVX512VL -
immintrin.h
- Cryptography -
- - - - - Perform one round of an AES decryption flow on data (state) in "a" using the round key in "RoundKey", and store the results in "dst". - FOR j := 0 to 1 - i := j*128 - a[i+127:i] := InvShiftRows(a[i+127:i]) - a[i+127:i] := InvSubBytes(a[i+127:i]) - a[i+127:i] := InvMixColumns(a[i+127:i]) - dst[i+127:i] := a[i+127:i] XOR RoundKey[i+127:i] -ENDFOR -dst[MAX:256] := 0 - - - VAES - AVX512VL -
immintrin.h
- Cryptography -
- - - - - - - - Carry-less multiplication of one quadword of - 'b' by one quadword of 'c', stores - the 128-bit result in 'dst'. The immediate 'Imm8' is - used to determine which quadwords of 'b' - and 'c' should be used. - -DEFINE PCLMUL128(X,Y) { - FOR i := 0 to 63 - TMP[i] := X[ 0 ] and Y[ i ] - FOR j := 1 to i - TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) - ENDFOR - DEST[ i ] := TMP[ i ] - ENDFOR - FOR i := 64 to 126 - TMP[i] := 0 - FOR j := i - 63 to 63 - TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) - ENDFOR - DEST[ i ] := TMP[ i ] - ENDFOR - DEST[127] := 0 - RETURN DEST // 128b vector -} -FOR i := 0 to 1 - IF Imm8[0] == 0 - TEMP1 := b.m128[i].qword[0] - ELSE - TEMP1 := b.m128[i].qword[1] - FI - IF Imm8[4] == 0 - TEMP2 := c.m128[i].qword[0] - ELSE - TEMP2 := c.m128[i].qword[1] - FI - dst.m128[i] := PCLMUL128(TEMP1, TEMP2) -ENDFOR -dst[MAX:256] := 0 - - - VPCLMULQDQ - AVX512VL -
immintrin.h
- Application-Targeted -
- - - - - - - - Carry-less multiplication of one quadword of - 'b' by one quadword of 'c', stores - the 128-bit result in 'dst'. The immediate 'Imm8' is - used to determine which quadwords of 'b' - and 'c' should be used. - -DEFINE PCLMUL128(X,Y) { - FOR i := 0 to 63 - TMP[i] := X[ 0 ] and Y[ i ] - FOR j := 1 to i - TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) - ENDFOR - DEST[ i ] := TMP[ i ] - ENDFOR - FOR i := 64 to 126 - TMP[i] := 0 - FOR j := i - 63 to 63 - TMP[i] := TMP[i] xor (X[ j ] and Y[ i - j ]) - ENDFOR - DEST[ i ] := TMP[ i ] - ENDFOR - DEST[127] := 0 - RETURN DEST // 128b vector -} -FOR i := 0 to 3 - IF Imm8[0] == 0 - TEMP1 := b.m128[i].qword[0] - ELSE - TEMP1 := b.m128[i].qword[1] - FI - IF Imm8[4] == 0 - TEMP2 := c.m128[i].qword[0] - ELSE - TEMP2 := c.m128[i].qword[1] - FI - dst.m128[i] := PCLMUL128(TEMP1, TEMP2) -ENDFOR -dst[MAX:512] := 0 - - - VPCLMULQDQ -
immintrin.h
- Application-Targeted -
- - - - - - - Directs the processor to enter an implementation-dependent optimized state until the TSC reaches or exceeds the value specified in "counter". Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. - - WAITPKG -
immintrin.h
- Miscellaneous -
- - - - - Directs the processor to enter an implementation-dependent optimized state while monitoring a range of addresses. The instruction wakes up when the TSC reaches or exceeds the value specified in "counter" (if the monitoring hardware did not trigger beforehand). Bit 0 of "ctrl" selects between a lower power (cleared) or faster wakeup (set) optimized state. Returns the carry flag (CF). If the processor that executed a UMWAIT instruction wakes due to the expiration of the operating system time limit, the instruction sets RFLAGS.CF; otherwise, that flag is cleared. - - WAITPKG -
immintrin.h
- Miscellaneous -
- - - - Sets up a linear address range to be - monitored by hardware and activates the - monitor. The address range should be a writeback - memory caching type. The address is - contained in "a". - - WAITPKG -
immintrin.h
- Miscellaneous -
- - - - - - Write back and do not flush internal caches. - Initiate writing-back without flushing of external - caches. - - WBNOINVD -
immintrin.h
- Miscellaneous -
- - - - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSAVEC -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSAVEC -
immintrin.h
- OS-Targeted -
- - - - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSAVEOPT -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - 2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSAVEOPT -
immintrin.h
- OS-Targeted -
- - - - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSS -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr"; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE - XSS -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - - XSAVE - XSS -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - - XSAVE - XSS -
immintrin.h
- OS-Targeted -
- - - - - - Copy up to 64-bits from the value of the extended control register (XCR) specified by "a" into "dst". Currently only XFEATURE_ENABLED_MASK XCR is supported. - dst[63:0] := XCR[a] - - - XSAVE -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - - XSAVE -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial restore of the enabled processor states using the state information stored in memory at "mem_addr". State is restored based on bits [62:0] in "rs_mask", "XCR0", and "mem_addr.HEADER.XSTATE_BV". "mem_addr" must be aligned on a 64-byte boundary. - st_mask := mem_addr.HEADER.XSTATE_BV[62:0] -FOR i := 0 to 62 - IF (rs_mask[i] AND XCR0[i]) - IF st_mask[i] - CASE (i) OF - 0: ProcessorState[x87_FPU] := mem_addr.FPUSSESave_Area[FPU] - 1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE] - DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i] - ESAC - ELSE - // ProcessorExtendedState := Processor Supplied Values - CASE (i) OF - 1: MXCSR := mem_addr.FPUSSESave_Area[SSE] - ESAC - FI - FI - i := i + 1 -ENDFOR - - - XSAVE -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE -
immintrin.h
- OS-Targeted -
- - - - - Perform a full or partial save of the enabled processor states to memory at "mem_addr". State is saved based on bits [62:0] in "save_mask" and "XCR0". "mem_addr" must be aligned on a 64-byte boundary. - mask[62:0] := save_mask[62:0] AND XCR0[62:0] -FOR i := 0 to 62 - IF mask[i] - CASE (i) OF - 0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87_FPU] - 1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE] - DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i] - ESAC - mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i] - FI - i := i + 1 -ENDFOR - - - XSAVE -
immintrin.h
- OS-Targeted -
- - - - - Copy 64-bits from "val" to the extended control register (XCR) specified by "a". Currently only XFEATURE_ENABLED_MASK XCR is supported. - -XCR[a] := val[63:0] - - - XSAVE -
immintrin.h
- OS-Targeted -
- - -
\ No newline at end of file From 11375b9eab3beafc217c7a642f7a2924bb379fd6 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 15 Oct 2025 14:33:21 +0530 Subject: [PATCH 67/73] chore: move from random testing to testing only the first N intrinsics --- Cargo.lock | 77 +++------------------------- crates/intrinsic-test/Cargo.toml | 1 - crates/intrinsic-test/src/x86/mod.rs | 10 +--- 3 files changed, 9 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e198e14ffe..70f09adf2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,18 +282,6 @@ dependencies = [ "wasi", ] -[[package]] -name = "getrandom" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" -dependencies = [ - "cfg-if", - "libc", - "r-efi", - "wasip2", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -360,7 +348,6 @@ dependencies = [ "log", "pretty_env_logger", "quick-xml 0.37.5", - "rand 0.9.2", "rayon", "regex", "serde", @@ -486,7 +473,7 @@ checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" dependencies = [ "env_logger 0.8.4", "log", - "rand 0.8.5", + "rand", ] [[package]] @@ -498,12 +485,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - [[package]] name = "rand" version = "0.8.5" @@ -511,18 +492,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha 0.9.0", - 
"rand_core 0.9.3", + "rand_chacha", + "rand_core", ] [[package]] @@ -532,17 +503,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.4", -] - -[[package]] -name = "rand_chacha" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" -dependencies = [ - "ppv-lite86", - "rand_core 0.9.3", + "rand_core", ] [[package]] @@ -551,16 +512,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.16", -] - -[[package]] -name = "rand_core" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" -dependencies = [ - "getrandom 0.3.4", + "getrandom", ] [[package]] @@ -751,7 +703,7 @@ dependencies = [ name = "stdarch-gen-loongarch" version = "0.1.0" dependencies = [ - "rand 0.8.5", + "rand", ] [[package]] @@ -784,7 +736,7 @@ version = "0.0.0" dependencies = [ "core_arch", "quickcheck", - "rand 0.8.5", + "rand", ] [[package]] @@ -867,15 +819,6 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" -[[package]] -name = "wasip2" -version = "1.0.1+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" -dependencies = [ - "wit-bindgen", -] - [[package]] name = "wasmparser" version = "0.235.0" @@ -1060,12 +1003,6 @@ version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" -[[package]] -name = "wit-bindgen" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" - [[package]] name = "xml-rs" version = "0.8.27" diff --git a/crates/intrinsic-test/Cargo.toml b/crates/intrinsic-test/Cargo.toml index 9fb70f32f8..2c0f53897e 100644 --- a/crates/intrinsic-test/Cargo.toml +++ b/crates/intrinsic-test/Cargo.toml @@ -22,4 +22,3 @@ itertools = "0.14.0" quick-xml = { version = "0.37.5", features = ["serialize", "overlapped-lists"] } serde-xml-rs = "0.8.0" regex = "1.11.1" -rand = "0.9.2" diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index ca5748e5fb..a28c8647fe 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -12,8 +12,6 @@ use crate::common::intrinsic::Intrinsic; use crate::common::intrinsic_helpers::TypeKind; use intrinsic::X86IntrinsicType; use itertools::Itertools; -use rand::rng; -use rand::seq::IndexedRandom; use xml_parser::get_xml_intrinsics; pub struct X86ArchitectureTest { @@ -49,10 +47,9 @@ impl SupportedArchitectureTest for X86ArchitectureTest { let intrinsics = get_xml_intrinsics(&cli_options.filename).expect("Error parsing input file"); - let mut rng = rng(); let sample_percentage: usize = cli_options.sample_percentage as usize; - let intrinsics = intrinsics + let mut intrinsics = intrinsics .into_iter() // Not sure how we would compare intrinsic that returns void. 
.filter(|i| i.results.kind() != TypeKind::Void) @@ -68,10 +65,7 @@ impl SupportedArchitectureTest for X86ArchitectureTest { .collect::>(); let sample_size = (intrinsics.len() * sample_percentage) / 100; - let mut intrinsics = intrinsics - .choose_multiple(&mut rng, sample_size) - .cloned() - .collect::>(); + intrinsics.truncate(sample_size); intrinsics.sort_by(|a, b| a.name.cmp(&b.name)); Self { From 2ecda3bdc1773d88a3b3e4f56ec973c4eadb6ca8 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Wed, 15 Oct 2025 15:04:04 +0530 Subject: [PATCH 68/73] chore: convert println! logging to trace! logging during compilation step --- crates/intrinsic-test/src/common/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/intrinsic-test/src/common/mod.rs b/crates/intrinsic-test/src/common/mod.rs index 86a7876807..d8f06ae238 100644 --- a/crates/intrinsic-test/src/common/mod.rs +++ b/crates/intrinsic-test/src/common/mod.rs @@ -76,12 +76,12 @@ pub trait SupportedArchitectureTest { // // This is done because `cpp_compiler_wrapped` is None when // the --generate-only flag is passed - println!("compiling mod_{i}.cpp"); + trace!("compiling mod_{i}.cpp"); if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { let compile_output = cpp_compiler .compile_object_file(&format!("mod_{i}.cpp"), &format!("mod_{i}.o")); - println!("finished compiling mod_{i}.cpp"); + trace!("finished compiling mod_{i}.cpp"); if let Err(compile_error) = compile_output { return Err(format!("Error compiling mod_{i}.cpp: {compile_error:?}")); } @@ -104,7 +104,7 @@ pub trait SupportedArchitectureTest { // the --generate-only flag is passed if let Some(cpp_compiler) = cpp_compiler_wrapped.as_ref() { // compile this cpp file into a .o file - info!("compiling main.cpp"); + trace!("compiling main.cpp"); let output = cpp_compiler .compile_object_file("main.cpp", "intrinsic-test-programs.o") .unwrap(); From 106b510c767b70e610d4175ee180eabe6e1f5ebb Mon Sep 17 00:00:00 2001 From: 
Madhav Madhusoodanan Date: Wed, 15 Oct 2025 20:54:11 +0530 Subject: [PATCH 69/73] feat: code cleanup 1. changing array bracket prefixes from &'static str to char 2. including variable names in template strings instead of passing them as arguments to macros --- crates/intrinsic-test/src/arm/types.rs | 14 +++++----- crates/intrinsic-test/src/common/argument.rs | 2 +- .../src/common/intrinsic_helpers.rs | 28 +++++++++---------- crates/intrinsic-test/src/x86/config.rs | 2 +- crates/intrinsic-test/src/x86/types.rs | 4 +-- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/crates/intrinsic-test/src/arm/types.rs b/crates/intrinsic-test/src/arm/types.rs index c798cbe42d..4be8d1e48b 100644 --- a/crates/intrinsic-test/src/arm/types.rs +++ b/crates/intrinsic-test/src/arm/types.rs @@ -14,10 +14,10 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { (None, None) => format!("{const_prefix}{prefix}{bit_len}_t"), (Some(simd), None) => format!("{prefix}{bit_len}x{simd}_t"), (Some(simd), Some(vec)) => format!("{prefix}{bit_len}x{simd}x{vec}_t"), - (None, Some(_)) => todo!("{:#?}", self), // Likely an invalid case + (None, Some(_)) => todo!("{self:#?}"), // Likely an invalid case } } else { - todo!("{:#?}", self) + todo!("{self:#?}") } } @@ -58,14 +58,14 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { // The ACLE doesn't support 64-bit polynomial loads on Armv7 // if armv7 and bl == 64, use "s", else "p" TypeKind::Poly => if choose_workaround && *bl == 64 {"s"} else {"p"}, - x => todo!("get_load_function TypeKind: {:#?}", x), + x => todo!("get_load_function TypeKind: {x:#?}"), }, size = bl, quad = quad, len = vec_len.unwrap_or(1), ) } else { - todo!("get_load_function IntrinsicType: {:#?}", self) + todo!("get_load_function IntrinsicType: {self:#?}") } } @@ -90,13 +90,13 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { TypeKind::Int(Sign::Signed) => "s", TypeKind::Float => "f", TypeKind::Poly => "p", - x => todo!("get_load_function TypeKind: {:#?}", x), + 
x => todo!("get_load_function TypeKind: {x:#?}"), }, size = bl, quad = quad, ) } else { - todo!("get_lane_function IntrinsicType: {:#?}", self) + todo!("get_lane_function IntrinsicType: {self:#?}") } } @@ -143,7 +143,7 @@ impl IntrinsicTypeDefinition for ArmIntrinsicType { TypeKind::Int(Sign::Signed) => format!("int{}_t", self.inner_size()), TypeKind::Int(Sign::Unsigned) => format!("uint{}_t", self.inner_size()), TypeKind::Poly => format!("poly{}_t", self.inner_size()), - ty => todo!("print_result_c - Unknown type: {:#?}", ty), + ty => todo!("print_result_c - Unknown type: {ty:#?}"), }, promote = self.generate_final_type_cast(), ) diff --git a/crates/intrinsic-test/src/common/argument.rs b/crates/intrinsic-test/src/common/argument.rs index 5963abef2f..5fb7d0f210 100644 --- a/crates/intrinsic-test/src/common/argument.rs +++ b/crates/intrinsic-test/src/common/argument.rs @@ -31,7 +31,7 @@ where pub fn to_c_type(&self) -> String { let prefix = if self.ty.constant { "const " } else { "" }; - format!("{}{}", prefix, self.ty.c_type()) + format!("{prefix}{}", self.ty.c_type()) } pub fn generate_name(&self) -> String { diff --git a/crates/intrinsic-test/src/common/intrinsic_helpers.rs b/crates/intrinsic-test/src/common/intrinsic_helpers.rs index aa8613206e..c2d66868ce 100644 --- a/crates/intrinsic-test/src/common/intrinsic_helpers.rs +++ b/crates/intrinsic-test/src/common/intrinsic_helpers.rs @@ -80,7 +80,7 @@ impl TypeKind { Self::Poly => "poly", Self::Char(Sign::Signed) => "char", Self::Vector => "int", - _ => unreachable!("Not used: {:#?}", self), + _ => unreachable!("Not used: {self:#?}"), } } @@ -94,7 +94,7 @@ impl TypeKind { Self::Poly => "u", Self::Char(Sign::Unsigned) => "u", Self::Char(Sign::Signed) => "i", - _ => unreachable!("Unused type kind: {:#?}", self), + _ => unreachable!("Unused type kind: {self:#?}"), } } } @@ -134,7 +134,7 @@ impl IntrinsicType { if let Some(bl) = self.bit_len { cmp::max(bl, 8) } else { - unreachable!("{:#?}", self) + 
unreachable!("{self:#?}") } } @@ -225,8 +225,8 @@ impl IntrinsicType { .. } => { let (prefix, suffix) = match language { - Language::Rust => ("[", "]"), - Language::C => ("{", "}"), + Language::Rust => ('[', ']'), + Language::C => ('{', '}'), }; let body_indentation = indentation.nested(); format!( @@ -262,12 +262,12 @@ impl IntrinsicType { .. } => { let (prefix, cast_prefix, cast_suffix, suffix) = match (language, bit_len) { - (&Language::Rust, 16) => ("[", "f16::from_bits(", ")", "]"), - (&Language::Rust, 32) => ("[", "f32::from_bits(", ")", "]"), - (&Language::Rust, 64) => ("[", "f64::from_bits(", ")", "]"), - (&Language::C, 16) => ("{", "cast(", ")", "}"), - (&Language::C, 32) => ("{", "cast(", ")", "}"), - (&Language::C, 64) => ("{", "cast(", ")", "}"), + (&Language::Rust, 16) => ('[', "f16::from_bits(", ")", ']'), + (&Language::Rust, 32) => ('[', "f32::from_bits(", ")", ']'), + (&Language::Rust, 64) => ('[', "f64::from_bits(", ")", ']'), + (&Language::C, 16) => ('{', "cast(", ")", '}'), + (&Language::C, 32) => ('{', "cast(", ")", '}'), + (&Language::C, 64) => ('{', "cast(", ")", '}'), _ => unreachable!(), }; format!( @@ -288,8 +288,8 @@ impl IntrinsicType { .. 
} => { let (prefix, suffix) = match language { - Language::Rust => ("[", "]"), - Language::C => ("{", "}"), + Language::Rust => ('[', ']'), + Language::C => ('{', '}'), }; let body_indentation = indentation.nested(); let effective_bit_len = 32; @@ -317,7 +317,7 @@ impl IntrinsicType { }) ) } - _ => unimplemented!("populate random: {:#?}", self), + _ => unimplemented!("populate random: {self:#?}"), } } diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 6be3f1b133..6d913acca7 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -213,7 +213,7 @@ trait DebugAs { impl DebugAs for T { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - write!(f, "{}", self) + write!(f, "{self}") } } diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index cdfc6bfa98..be15b6dccd 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -219,11 +219,11 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { TypeKind::Float if self.inner_size() == 32 => "float".to_string(), TypeKind::Mask => format!( "__mmask{}", - self.bit_len.expect(format!("self: {:#?}", self).as_str()) + self.bit_len.expect(format!("self: {self:#?}").as_str()) ), TypeKind::Vector => format!( "__m{}i", - self.bit_len.expect(format!("self: {:#?}", self).as_str()) + self.bit_len.expect(format!("self: {self:#?}").as_str()) ), _ => self.c_scalar_type(), }, From 19a6292a44aad22987e30c0d999e4bbfdcabb2bf Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 16 Oct 2025 09:47:17 +0530 Subject: [PATCH 70/73] chore: make names in config.rs files uniform across architectures --- crates/intrinsic-test/src/arm/config.rs | 8 ++++---- crates/intrinsic-test/src/arm/mod.rs | 8 ++++---- crates/intrinsic-test/src/x86/config.rs | 4 ++-- crates/intrinsic-test/src/x86/mod.rs | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) 
diff --git a/crates/intrinsic-test/src/arm/config.rs b/crates/intrinsic-test/src/arm/config.rs index e2bc501127..a634645969 100644 --- a/crates/intrinsic-test/src/arm/config.rs +++ b/crates/intrinsic-test/src/arm/config.rs @@ -3,7 +3,7 @@ pub const NOTICE: &str = "\ // test are derived from a JSON specification, published under the same license as the // `intrinsic-test` crate.\n"; -pub const POLY128_OSTREAM_DECL: &str = r#" +pub const PLATFORM_C_FORWARD_DECLARATIONS: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value); #endif @@ -20,7 +20,7 @@ template T1 cast(T2 x) { } "#; -pub const POLY128_OSTREAM_DEF: &str = r#" +pub const PLATFORM_C_DEFINITIONS: &str = r#" #ifdef __aarch64__ std::ostream& operator<<(std::ostream& os, poly128_t value) { std::stringstream temp; @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& os, uint8_t value) { "#; // Format f16 values (and vectors containing them) in a way that is consistent with C. -pub const F16_FORMATTING_DEF: &str = r#" +pub const PLATFORM_RUST_DEFINITIONS: &str = r#" /// Used to continue `Debug`ging SIMD types as `MySimd(1, 2, 3, 4)`, as they /// were before moving to array-based simd. 
#[inline] @@ -139,7 +139,7 @@ impl DebugHexF16 for float16x8x4_t { } "#; -pub const AARCH_CONFIGURATIONS: &str = r#" +pub const PLATFORM_RUST_CFGS: &str = r#" #![cfg_attr(target_arch = "arm", feature(stdarch_arm_neon_intrinsics))] #![cfg_attr(target_arch = "arm", feature(stdarch_aarch32_crc32))] #![cfg_attr(any(target_arch = "aarch64", target_arch = "arm64ec"), feature(stdarch_neon_fcma))] diff --git a/crates/intrinsic-test/src/arm/mod.rs b/crates/intrinsic-test/src/arm/mod.rs index 08dc2d3870..7fa5062e86 100644 --- a/crates/intrinsic-test/src/arm/mod.rs +++ b/crates/intrinsic-test/src/arm/mod.rs @@ -32,11 +32,11 @@ impl SupportedArchitectureTest for ArmArchitectureTest { const NOTICE: &str = config::NOTICE; const PLATFORM_C_HEADERS: &[&str] = &["arm_neon.h", "arm_acle.h", "arm_fp16.h"]; - const PLATFORM_C_DEFINITIONS: &str = config::POLY128_OSTREAM_DEF; - const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::POLY128_OSTREAM_DECL; + const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS; + const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS; - const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; - const PLATFORM_RUST_CFGS: &str = config::AARCH_CONFIGURATIONS; + const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS; + const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS; fn cpp_compilation(&self) -> Option { compile::build_cpp_compilation(&self.cli_options) diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index 6d913acca7..d7770189eb 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -4,7 +4,7 @@ pub const NOTICE: &str = "\ // `intrinsic-test` crate.\n"; // Format f16 values (and vectors containing them) in a way that is consistent with C. 
-pub const F16_FORMATTING_DEF: &str = r#" +pub const PLATFORM_RUST_DEFINITIONS: &str = r#" use std::arch::x86_64::*; #[inline] @@ -392,7 +392,7 @@ std::ostream& operator<<(std::ostream& os, __mmask8 value) { } "#; -pub const X86_CONFIGURATIONS: &str = r#" +pub const PLATFORM_RUST_CFGS: &str = r#" #![cfg_attr(target_arch = "x86", feature(avx))] #![cfg_attr(target_arch = "x86", feature(sse))] #![cfg_attr(target_arch = "x86", feature(sse2))] diff --git a/crates/intrinsic-test/src/x86/mod.rs b/crates/intrinsic-test/src/x86/mod.rs index a28c8647fe..956e51836f 100644 --- a/crates/intrinsic-test/src/x86/mod.rs +++ b/crates/intrinsic-test/src/x86/mod.rs @@ -40,8 +40,8 @@ impl SupportedArchitectureTest for X86ArchitectureTest { const PLATFORM_C_DEFINITIONS: &str = config::PLATFORM_C_DEFINITIONS; const PLATFORM_C_FORWARD_DECLARATIONS: &str = config::PLATFORM_C_FORWARD_DECLARATIONS; - const PLATFORM_RUST_DEFINITIONS: &str = config::F16_FORMATTING_DEF; - const PLATFORM_RUST_CFGS: &str = config::X86_CONFIGURATIONS; + const PLATFORM_RUST_DEFINITIONS: &str = config::PLATFORM_RUST_DEFINITIONS; + const PLATFORM_RUST_CFGS: &str = config::PLATFORM_RUST_CFGS; fn create(cli_options: ProcessedCli) -> Self { let intrinsics = From a80eff1a38b62c4f120e45c64e4cd7925fabfb9a Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Fri, 17 Oct 2025 22:26:59 +0530 Subject: [PATCH 71/73] fix: remove the PATH update in ci/run.sh --- ci/run.sh | 2 -- crates/intrinsic-test/src/common/compare.rs | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index bd0e06687f..48dfe2a77d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -96,8 +96,6 @@ case ${TARGET} in TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 export STDARCH_DISABLE_ASSERT_INSTR=1 - PATH="$PATH":"$(pwd)"/c_programs - export PATH export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" cargo_test "${PROFILE}" diff --git 
a/crates/intrinsic-test/src/common/compare.rs b/crates/intrinsic-test/src/common/compare.rs index 89e5f965bc..902df94283 100644 --- a/crates/intrinsic-test/src/common/compare.rs +++ b/crates/intrinsic-test/src/common/compare.rs @@ -15,13 +15,13 @@ pub fn compare_outputs(intrinsic_name_list: &Vec, runner: &str, target: .par_iter() .filter_map(|intrinsic_name| { let c = runner_command(runner) - .arg("intrinsic-test-programs") + .arg("./intrinsic-test-programs") .arg(intrinsic_name) .current_dir("c_programs") .output(); let rust = runner_command(runner) - .arg(format!("target/{target}/release/intrinsic-test-programs")) + .arg(format!("./target/{target}/release/intrinsic-test-programs")) .arg(intrinsic_name) .current_dir("rust_programs") .output(); From ce179da6118ed81ac938db3ca3ab274606647d2d Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Thu, 23 Oct 2025 10:17:33 +0530 Subject: [PATCH 72/73] feat: fixing Rust's print mechanism for _mm512_conj_pch --- crates/intrinsic-test/missing_x86.txt | 2 +- crates/intrinsic-test/src/x86/config.rs | 3 +++ crates/intrinsic-test/src/x86/types.rs | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/crates/intrinsic-test/missing_x86.txt b/crates/intrinsic-test/missing_x86.txt index e546799740..58e37b92a1 100644 --- a/crates/intrinsic-test/missing_x86.txt +++ b/crates/intrinsic-test/missing_x86.txt @@ -890,7 +890,7 @@ _mm256_extract_epi16 _mm256_extract_epi8 _mm512_castsi128_si512 _mm512_castsi256_si512 -_mm512_conj_pch +# _mm512_conj_pch _mm512_mask_reduce_max_pd _mm512_mask_reduce_max_ps _mm512_mask_reduce_min_pd diff --git a/crates/intrinsic-test/src/x86/config.rs b/crates/intrinsic-test/src/x86/config.rs index d7770189eb..7c349e4482 100644 --- a/crates/intrinsic-test/src/x86/config.rs +++ b/crates/intrinsic-test/src/x86/config.rs @@ -235,6 +235,9 @@ macro_rules! 
impl_debug_as { impl_debug_as!(__m128i, "__m128i", 128, [u8, i8, u16, i16, u32, i32, u64, i64]); impl_debug_as!(__m256i, "__m256i", 256, [u8, i8, u16, i16, u32, i32, u64, i64]); impl_debug_as!(__m512i, "__m512i", 512, [u8, i8, u16, i16, u32, i32, u64, i64]); +impl_debug_as!(__m128h, "__m128h", 128, [f32]); +impl_debug_as!(__m256h, "__m256h", 256, [f32]); +impl_debug_as!(__m512h, "__m512h", 512, [f32]); fn debug_as(x: V) -> impl core::fmt::Debug where V: DebugAs diff --git a/crates/intrinsic-test/src/x86/types.rs b/crates/intrinsic-test/src/x86/types.rs index be15b6dccd..87932fcb3e 100644 --- a/crates/intrinsic-test/src/x86/types.rs +++ b/crates/intrinsic-test/src/x86/types.rs @@ -290,6 +290,12 @@ impl IntrinsicTypeDefinition for X86IntrinsicType { fn print_result_rust(&self) -> String { let return_value = match self.kind() { TypeKind::Float if self.inner_size() == 16 => "debug_f16(__return_value)".to_string(), + TypeKind::Float + if self.inner_size() == 32 + && ["__m512h"].contains(&self.param.type_data.as_str()) => + { + "debug_as::<_, f32>(__return_value)".to_string() + } TypeKind::Int(_) if ["__m128i", "__m256i", "__m512i"].contains(&self.param.type_data.as_str()) => { From 41357a0a542d124c9a23064e6a182596a955c8d1 Mon Sep 17 00:00:00 2001 From: Madhav Madhusoodanan Date: Sun, 26 Oct 2025 18:19:45 +0530 Subject: [PATCH 73/73] feat: added x86_64-unknown-linux-gnu to the test matrix of `intrinsic-test` --- .github/workflows/main.yml | 1 + ci/intrinsic-test.sh | 24 ++++++++++++++++++++++++ ci/run.sh | 5 ----- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b852110a32..28c15cf473 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -260,6 +260,7 @@ jobs: - aarch64_be-unknown-linux-gnu - armv7-unknown-linux-gnueabihf - arm-unknown-linux-gnueabihf + - x86_64-unknown-linux-gnu profile: [dev, release] include: - target: aarch64_be-unknown-linux-gnu diff --git 
a/ci/intrinsic-test.sh b/ci/intrinsic-test.sh index 469e9e21c7..e14a824b2a 100755 --- a/ci/intrinsic-test.sh +++ b/ci/intrinsic-test.sh @@ -66,6 +66,14 @@ case ${TARGET} in TEST_CXX_COMPILER="clang++" TEST_RUNNER="${CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER}" ;; + + x86_64-unknown-linux-gnu*) + TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" + TEST_CXX_COMPILER="clang++" + TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" + TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt + TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 + ;; *) ;; @@ -94,6 +102,22 @@ case "${TARGET}" in --linker "${CARGO_TARGET_AARCH64_BE_UNKNOWN_LINUX_GNU_LINKER}" \ --cxx-toolchain-dir "${AARCH64_BE_TOOLCHAIN}" ;; + + x86_64-unknown-linux-gnu*) + # `CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER` is not necessary for `intrinsic-test` + # because the binary needs to run directly on the host. + # Hence the use of `env -u`. + env -u CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER \ + CPPFLAGS="${TEST_CPPFLAGS}" RUSTFLAGS="${HOST_RUSTFLAGS}" \ + RUST_LOG=warn RUST_BACKTRACE=1 \ + cargo run "${INTRINSIC_TEST}" "${PROFILE}" \ + --bin intrinsic-test -- intrinsics_data/x86-intel.xml \ + --runner "${TEST_RUNNER}" \ + --skip "${TEST_SKIP_INTRINSICS}" \ + --cppcompiler "${TEST_CXX_COMPILER}" \ + --target "${TARGET}" \ + --sample-percentage "${TEST_SAMPLE_INTRINSICS_PERCENTAGE}" + ;; *) ;; esac diff --git a/ci/run.sh b/ci/run.sh index 48dfe2a77d..2bb77bae25 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -90,11 +90,6 @@ fi # Test targets compiled with extra features. case ${TARGET} in x86_64-unknown-linux-gnu) - TEST_CPPFLAGS="-fuse-ld=lld -I/usr/include/x86_64-linux-gnu/" - TEST_CXX_COMPILER="clang++" - TEST_RUNNER="${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER}" - TEST_SKIP_INTRINSICS=crates/intrinsic-test/missing_x86.txt - TEST_SAMPLE_INTRINSICS_PERCENTAGE=5 export STDARCH_DISABLE_ASSERT_INSTR=1 export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"